Contents

1 Install required packages

Install the bigWig, DESeq2 and dplyr packages (latticeExtra is installed further below)

# devtools is only needed to install bigWig from GitHub.
install.packages("devtools", quiet = TRUE)
library(devtools)
devtools::install_github('andrelmartins/bigWig',
              subdir='bigWig')
library(bigWig)

# DESeq2 is distributed through Bioconductor, not CRAN, so a plain
# install.packages("DESeq2") cannot find it; install it via BiocManager.
if (!requireNamespace("DESeq2", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager", quiet = TRUE)
  }
  BiocManager::install("DESeq2")
}
install.packages("dplyr", quiet = TRUE)

Install bedtools

# Download and run the Homebrew install script
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
# Install bedtools through Homebrew
brew install bedtools

Install Biostrings

# Install Biostrings only when it is not already available; BiocManager is
# installed first if it is missing, because Biostrings is installed through it.
if (!requireNamespace("Biostrings", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("Biostrings")
}
## 'getOption("repos")' replaces Bioconductor standard repositories, see
## 'help("repositories", package = "BiocManager")' for details.
## Replacement repositories:
##     CRAN: https://cran.rstudio.com
## Bioconductor version 3.18 (BiocManager 1.30.22), R 4.3.3 (2024-02-29)
## Installing package(s) 'Biostrings'
## also installing the dependencies 'bitops', 'zlibbioc', 'RCurl', 'GenomeInfoDbData', 'BiocGenerics', 'S4Vectors', 'IRanges', 'XVector', 'GenomeInfoDb', 'crayon'
## 
## The downloaded binary packages are in
##  /tmp/RtmpAEQ7p6/downloaded_packages
## installing the source package 'GenomeInfoDbData'
## Old packages: 'boot'

Install latticeExtra

install.packages("latticeExtra", quiet = TRUE)
## also installing the dependencies 'Rcpp', 'deldir', 'RcppEigen', 'png', 'jpeg', 'RColorBrewer', 'interp'

2 Feb 5th

3 RSAT - DYAD analysis

In the previous analysis, I ran the RSAT-dyad analysis, using the ENCODE DHS regions as a control for calculating the expected 3mer dyad occurrence, and generated these files with dyad patterns and the corresponding statistics.
Command that I used:

rsat dyad-analysis -o GATA3_peak_161win_with_motif_1_RSAT_dyad.txt -i GATA3_peak_161win_with_motif_1.fasta -format FastA -l 3 -sp 0-20 -expfreq ENCODE.MCF7.DHS.background4.txt -return exp_occ,occ,ratio -sort -seqtype dna 


# -1str single strand count; only the direct strand is considered for oligonucleotide and dyad occurrence counting.
# -2str count on both strands
        #The occurrences of each oligonucleotide are summed on both strands. This allows to detect elements which act in an orientation-insensitive way (as is generally the case for yeast upstream elements).
        
# -type dyad_type (dr|ir|any) any   (default)
        #In order to fasten execution, the program can be asked to restrict its analysis to symmetric dyads.
        #Three types are accepted
           #dr  direct repeats: the second element is the same as the first one
           #ir  inverted repeats: the second element is the reverse complement of the first one.
           #rep  repeats: direct and inverted repeats are evaluated
           #any (default)
             #When selecting the option any, the analysis is performed on all non-symmetric dyads as well.

Refer to the help menu.

# Change into the RSAT directory of this analysis
cd /home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/RSAT/
# Activate miniconda, then the conda environment named "rsat"
source ~/miniconda3/bin/activate
conda activate rsat
#rsat --help
# Print the help text of the three RSAT sub-commands
rsat oligo-analysis -h
rsat create-background-model -h
rsat dyad-analysis -h

Files generated:
GATA3_peak_161win_with_motif_1_RSAT_dyad.txt
GATA3_peak_161win_with_motif_2_RSAT_dyad.txt
GATA3_peak_161win_with_motif_4_RSAT_dyad.txt
GATA3_peak_161win_with_motif_5_RSAT_dyad.txt
GATA3_peak_161win_with_motif_6_RSAT_dyad.txt

head -6 GATA3_peak_161win_with_motif_1_RSAT_dyad.txt
#sequence   identifier              expected_freq   occ exp_occ ovl_occall_occ  ratio
#gatn{3}atc gatn{3}atc|gatn{3}atc   0.0000716460535   12482   85.98 36  12518    145.17
#agan{4}atc agan{4}atc|gatn{4}tct   0.0003298219842   10904  390.07 4445    15349     27.95
#atan{2}atc atan{2}atc|gatn{2}tat   0.0001616794646   10453  196.05 1460    11913     53.32
#gatn{4}tca gatn{4}tca|tgan{4}atc   0.0003742452755    7872  442.61 1728    9600      17.79
#agan{3}tat agan{3}tat|atan{3}tct   0.0002979478048    7425  357.56 613 8038      20.77

The first column is the sequence pattern found by the RSAT dyad analysis. Notice that n{x} is the spacing between the two elements of the dyad, i.e. the number of bases between the end of the first element and the start of the second one. This differs from what we refer to as the “relative distance”, which is anchored at the G of each of the two 3-mers. We will convert between the two while processing the data.

The second column lists all identifiers for that specific pattern. Notice that while running the dyad analysis, we specified the parameter -1str to only count the direct strand (note that the command shown above does not pass this flag explicitly). Thus, only the main structure and its reverse complement (for both 3-mers) are uniquely listed in the result files. This is convenient for downstream analysis.

The final column is the ratio of observed to expected occurrences (occ/exp_occ), which can serve as a rough “enrichment score”. I plan to extract the patterns linked to GAT/ATC and create a bar chart or xyplot to visually depict their frequencies.
Notice that using this ratio may overestimate some patterns (for example, those with a very small expected occurrence).

awk -F'\t' '$1 ~ /^gat.*atc/' GATA3_peak_161win_with_motif_1_RSAT_dyad.txt | wc -l
#21

awk -F'\t' '$1 ~ /^atc.*gat/' GATA3_peak_161win_with_motif_1_RSAT_dyad.txt  | wc -l
#21

In the RSAT dyad analysis, gat-atc and atc-gat denote two different structures.

awk -F'\t' '$1 ~ /^gat.*atc/' GATA3_peak_161win_with_motif_1_RSAT_dyad.txt | sort -k8,8nr
echo " "
awk -F'\t' '$1 ~ /^atc.*gat/' GATA3_peak_161win_with_motif_1_RSAT_dyad.txt | sort -k8,8nr
## gatn{3}atc   gatn{3}atc|gatn{3}atc   0.0000716460535   12482   85.98 36  12518    145.17
## gatn{0}atc   gatn{0}atc|gatn{0}atc   0.0000515638671     614   64.15 0   614    9.57
## gatn{4}atc   gatn{4}atc|gatn{4}atc   0.0000504302144     545   59.64 6   551    9.14
## gatn{16}atc  gatn{16}atc|gatn{16}atc 0.0000944795637     590   97.70 9   599    6.04
## gatn{15}atc  gatn{15}atc|gatn{15}atc 0.0000818438092     516   85.70 8   524    6.02
## gatn{20}atc  gatn{20}atc|gatn{20}atc 0.0000942591829     547   92.71 5   552    5.90
## gatn{10}atc  gatn{10}atc|gatn{10}atc 0.0000987860472     604  109.68 6   610    5.51
## gatn{17}atc  gatn{17}atc|gatn{17}atc 0.0000906426659     503   92.61 5   508    5.43
## gatn{8}atc   gatn{8}atc|gatn{8}atc   0.0000899257860     544  102.07 7   551    5.33
## gatn{13}atc  gatn{13}atc|gatn{13}atc 0.0000906646305     516   97.24 6   522    5.31
## gatn{14}atc  gatn{14}atc|gatn{14}atc 0.0000916101649     510   97.11 9   519    5.25
## gatn{6}atc   gatn{6}atc|gatn{6}atc   0.0000971154028     591  112.60 4   595    5.25
## gatn{11}atc  gatn{11}atc|gatn{11}atc 0.0000932630475     533  102.39 9   542    5.21
## gatn{12}atc  gatn{12}atc|gatn{12}atc 0.0000888395037     493   96.41 6   499    5.11
## gatn{9}atc   gatn{9}atc|gatn{9}atc   0.0000902432624     515  101.37 12  527    5.08
## gatn{18}atc  gatn{18}atc|gatn{18}atc 0.0000939778165     466   94.81 7   473    4.92
## gatn{19}atc  gatn{19}atc|gatn{19}atc 0.0000965793592     437   96.22 4   441    4.54
## gatn{7}atc   gatn{7}atc|gatn{7}atc   0.0000847463572     423   97.33 5   428    4.35
## gatn{1}atc   gatn{1}atc|gatn{1}atc   0.0000635633129     315   78.20 1   316    4.03
## gatn{5}atc   gatn{5}atc|gatn{5}atc   0.0000883000715     258  103.60 2   260    2.49
## gatn{2}atc   gatn{2}atc|gatn{2}atc   0.0000878513418     209  106.53 0   209    1.96
##  
## atcn{1}gat   atcn{1}gat|atcn{1}gat   0.0000365557250     915   44.97 6   921   20.35
## atcn{13}gat  atcn{13}gat|atcn{13}gat 0.0000915682647     986   98.21 3   989   10.04
## atcn{19}gat  atcn{19}gat|atcn{19}gat 0.0000783118818     524   78.02 10  534    6.72
## atcn{8}gat   atcn{8}gat|atcn{8}gat   0.0000890597270     659  101.08 10  669    6.52
## atcn{17}gat  atcn{17}gat|atcn{17}gat 0.0000858063102     546   87.67 8   554    6.23
## atcn{6}gat   atcn{6}gat|atcn{6}gat   0.0000802195944     562   93.01 4   566    6.04
## atcn{3}gat   atcn{3}gat|atcn{3}gat   0.0000953432975     686  114.42 1   687    6.00
## atcn{16}gat  atcn{16}gat|atcn{16}gat 0.0000842739153     521   87.15 3   524    5.98
## atcn{11}gat  atcn{11}gat|atcn{11}gat 0.0000917826817     581  100.76 7   588    5.77
## atcn{15}gat  atcn{15}gat|atcn{15}gat 0.0000847558549     510   88.75 7   517    5.75
## atcn{2}gat   atcn{2}gat|atcn{2}gat   0.0000822145577     559   99.69 0   559    5.61
## atcn{7}gat   atcn{7}gat|atcn{7}gat   0.0000795928625     495   91.41 3   498    5.42
## atcn{9}gat   atcn{9}gat|atcn{9}gat   0.0000988309276     589  111.02 2   591    5.31
## atcn{20}gat  atcn{20}gat|atcn{20}gat 0.0000944194877     481   92.87 5   486    5.18
## atcn{14}gat  atcn{14}gat|atcn{14}gat 0.0000920659368     505   97.60 7   512    5.17
## atcn{10}gat  atcn{10}gat|atcn{10}gat 0.0000804379701     458   89.30 3   461    5.13
## atcn{18}gat  atcn{18}gat|atcn{18}gat 0.0000928758991     475   93.70 3   478    5.07
## atcn{12}gat  atcn{12}gat|atcn{12}gat 0.0000918257056     501   99.65 2   503    5.03
## atcn{4}gat   atcn{4}gat|atcn{4}gat   0.0000924786758     548  109.37 0   548    5.01
## atcn{5}gat   atcn{5}gat|atcn{5}gat   0.0000961865213     534  112.85 2   536    4.73
## atcn{0}gat   atcn{0}gat|atcn{0}gat   0.0000150225440      44   18.69 0   44     2.35

Notice that:
1) the x value of n{x} (the spacing) in RSAT is defined differently than in the customized analysis. For instance, gatn{3}atc corresponds to what we define in the customized analysis as an 8 bp relative distance between the G in gat and the C in atc.
2) Notice the second column–“identifier”.

On the other hand, gat-gat and atc-atc denote the same structure.

awk -F'\t' '$1 ~ /^gat.*gat/' GATA3_peak_161win_with_motif_1_RSAT_dyad.txt | sort -k8,8nr
awk -F'\t' '$1 ~ /^atc.*atc/' GATA3_peak_161win_with_motif_1_RSAT_dyad.txt | sort -k8,8nr
## atcn{2}atc   atcn{2}atc|gatn{2}gat   0.0001743278582    3755  211.39 179 3934      17.76
## atcn{19}atc  atcn{19}atc|gatn{19}gat 0.0001834690131    1423  182.79 61  1484       7.78
## atcn{7}atc   atcn{7}atc|gatn{7}gat   0.0001966917141    1525  225.90 90  1615       6.75
## atcn{10}atc  atcn{10}atc|gatn{10}gat 0.0002081405868    1541  231.08 90  1631       6.67
## atcn{9}atc   atcn{9}atc|gatn{9}gat   0.0002062495205    1427  231.68 145 1572       6.16
## atcn{5}atc   atcn{5}atc|gatn{5}gat   0.0001401253129     970  164.40 81  1051       5.90
## atcn{12}atc  atcn{12}atc|gatn{12}gat 0.0001884293340    1200  204.49 69  1269       5.87
## atcn{20}atc  atcn{20}atc|gatn{20}gat 0.0001875565375    1068  184.47 62  1130       5.79
## atcn{8}atc   atcn{8}atc|gatn{8}gat   0.0001814393467    1159  205.94 53  1212       5.63
## atcn{15}atc  atcn{15}atc|gatn{15}gat 0.0002001648218    1151  209.60 86  1237       5.49
## atcn{16}atc  atcn{16}atc|gatn{16}gat 0.0001767432755     998  182.77 59  1057       5.46
## atcn{14}atc  atcn{14}atc|gatn{14}gat 0.0001838280257    1035  194.87 73  1108       5.31
## atcn{13}atc  atcn{13}atc|gatn{13}gat 0.0001850944034    1024  198.52 97  1121       5.16
## atcn{4}atc   atcn{4}atc|gatn{4}gat   0.0001842588721    1102  217.92 36  1138       5.06
## atcn{11}atc  atcn{11}atc|gatn{11}gat 0.0001856378756    1013  203.80 60  1073       4.97
## atcn{18}atc  atcn{18}atc|gatn{18}gat 0.0001950393881     960  196.77 52  1012       4.88
## atcn{17}atc  atcn{17}atc|gatn{17}gat 0.0001901779857     942  194.30 111 1053       4.85
## atcn{6}atc   atcn{6}atc|gatn{6}gat   0.0001786128314     991  207.10 39  1030       4.79
## atcn{0}atc   atcn{0}atc|gatn{0}gat   0.0003333380702    1643  414.73 41  1684       3.96
## atcn{3}atc   atcn{3}atc|gatn{3}gat   0.0001916566577     851  230.00 59  910    3.70
## atcn{1}atc   atcn{1}atc|gatn{1}gat   0.0001617727236     690  199.01 53  743    3.47

3.1 coherence check – single input sequence

The input .fasta file has 12470 sequences, the output .txt file identified 43680 patterns.

Why are there so many more output patterns than input sequences?
If we only input one sequence, what will the output look like?

source ~/miniconda3/bin/activate
conda activate rsat
cat test_single_input.fasta
#>chr10:100072622-100072723
#CAGATTTTATCATTTATTTGCTCATGTATTCACTCACTCATTAGGTCatctatttagtcaaccaacatttacttaagtccttctctattcagagctctcag

rsat dyad-analysis -o test_single_input_RSAT_dyad.txt -i test_single_input.fasta -format FastA -l 3 -sp 0-20 -expfreq ENCODE.MCF7.DHS.background4.txt -return exp_occ,occ,ratio -sort -seqtype dna 

For this single sequence input, it generates 25662 patterns.

And if we look for the cat-tca pattern, RSAT identifies several cat-tca 3-mer pairs with different spacings. This differs from our customized analysis, in which, for each provided sequence, we only report the one occurrence of a specific pattern that is closest to the peak summit. If I am looking for the cat-tca pattern, it will only give me one output.

#head test_single_input_RSAT_dyad.txt
awk -F'\t' '$1 ~ /^cat.*tca/' test_single_input_RSAT_dyad.txt | sort -k8,8nr
## catn{20}tca  catn{20}tca|tgan{20}atg 0.0005782191715       2    0.04 1   3     48.04
## catn{16}tca  catn{16}tca|tgan{16}atg 0.0005603828788       2    0.04 0   2     47.59
## catn{8}tca   catn{8}tca|tgan{8}atg   0.0005818472604       2    0.05 1   3     41.92
## catn{4}tca   catn{4}tca|tgan{4}atg   0.0005971999077       2    0.05 0   2     36.40
## catn{12}tca  catn{12}tca|tgan{12}atg 0.0005693193744       1    0.05 0   1     20.91
## catn{0}tca   catn{0}tca|tgan{0}atg   0.0006561197581       0    0.06 0   0      0.00
## catn{10}tca  catn{10}tca|tgan{10}atg 0.0005698178829       0    0.05 0   0      0.00
## catn{11}tca  catn{11}tca|tgan{11}atg 0.0005810435898       0    0.05 0   0      0.00
## catn{13}tca  catn{13}tca|tgan{13}atg 0.0005703437801       0    0.05 0   0      0.00
## catn{14}tca  catn{14}tca|tgan{14}atg 0.0006183306318       0    0.05 0   0      0.00
## catn{15}tca  catn{15}tca|tgan{15}atg 0.0005917583284       0    0.04 0   0      0.00
## catn{17}tca  catn{17}tca|tgan{17}atg 0.0005702219341       0    0.04 0   0      0.00
## catn{18}tca  catn{18}tca|tgan{18}atg 0.0005375782971       0    0.04 0   0      0.00
## catn{19}tca  catn{19}tca|tgan{19}atg 0.0005756638123       0    0.04 0   0      0.00
## catn{1}tca   catn{1}tca|tgan{1}atg   0.0006382247667       0    0.06 0   0      0.00
## catn{2}tca   catn{2}tca|tgan{2}atg   0.0005706900152       0    0.05 0   0      0.00
## catn{3}tca   catn{3}tca|tgan{3}atg   0.0007944812861       0    0.07 0   0      0.00
## catn{5}tca   catn{5}tca|tgan{5}atg   0.0005682468718       0    0.05 0   0      0.00
## catn{6}tca   catn{6}tca|tgan{6}atg   0.0005886556431       0    0.05 0   0      0.00
## catn{7}tca   catn{7}tca|tgan{7}atg   0.0006022431158       0    0.05 0   0      0.00
## catn{9}tca   catn{9}tca|tgan{9}atg   0.0005766835572       0    0.05 0   0      0.00

3.2 Extract the pattern info we need

library(Biostrings)
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
# Extract one dyad family from an RSAT dyad-analysis result table.
#
# The file is read as tab-separated text without a header. Column 1 holds the
# dyad (e.g. "gatn{3}atc") and column 8 the obs/exp ratio. Each dyad is split
# into its first element, its spacing and its second element, and the rows
# whose elements match `pattern1` / `pattern2` (case-insensitive) are kept.
# When both patterns are identical, rows written with the reverse complement
# of the patterns are kept as well (e.g. the "atc...atc" rows for GAT/GAT).
#
# Arguments:
#   file_path  path to the RSAT dyad-analysis output file
#   pattern1   DNA string for the first dyad element, e.g. "GAT"
#   pattern2   DNA string for the second dyad element, e.g. "ATC"
#   number     offset added to the RSAT spacing to get `relative_distance`
#
# Returns a data.frame with the columns first, dyad_distance, second, ratio
# and relative_distance, ordered by decreasing ratio.
process_and_subset_RSAT <- function(file_path, pattern1, pattern2, number) {
  # Keep text columns as character on every R version (strsplit needs it)
  input_data <- read.table(file_path, header = FALSE, sep = "\t",
                           stringsAsFactors = FALSE)

  # "gatn{3}atc" -> c("gat", "3", "atc")
  components_list <- strsplit(input_data$V1, "n\\{|\\}")
  # vapply keeps a character result even for empty or malformed entries
  component <- function(i) {
    vapply(components_list, function(x) x[i], character(1))
  }

  processed_data <- data.frame(
    first = component(1),
    dyad_distance = as.numeric(component(2)),
    second = component(3),
    ratio = input_data$V8, # ratio
    stringsAsFactors = FALSE
  )

  # DNAString() validates the patterns; the matching itself is done on plain
  # character strings so grepl() never receives an S4 object.
  pattern1 <- DNAString(pattern1)
  pattern2 <- DNAString(pattern2)
  fwd1 <- as.character(pattern1)
  fwd2 <- as.character(pattern2)

  matches_dyad <- function(p1, p2) {
    grepl(p1, processed_data$first, ignore.case = TRUE) &
      grepl(p2, processed_data$second, ignore.case = TRUE)
  }

  keep_fwd <- matches_dyad(fwd1, fwd2)
  output_data <- processed_data[keep_fwd, ]

  if (fwd1 == fwd2) {
    rc1 <- as.character(reverseComplement(pattern1))
    rc2 <- as.character(reverseComplement(pattern2))
    # Exclude rows already selected above, so a pattern that equals its own
    # reverse complement does not end up duplicated in the result.
    keep_rc <- matches_dyad(rc1, rc2) & !keep_fwd
    output_data <- rbind(output_data, processed_data[keep_rc, ])
  }

  # Order by descending ratio
  output_data <- output_data[order(-output_data$ratio), ]

  # Convert the RSAT spacing into the relative distance used elsewhere
  output_data$relative_distance <- output_data$dyad_distance + number

  output_data
}


GAT_GAT <- process_and_subset_RSAT("./GATA3_peak_161win_with_motif_1_RSAT_dyad.txt", "GAT", "GAT", 3)
GAT_ATC <- process_and_subset_RSAT("./GATA3_peak_161win_with_motif_1_RSAT_dyad.txt", "GAT", "ATC", 5)

nrow(GAT_GAT)
## [1] 21
head(GAT_GAT)
##      first dyad_distance second ratio relative_distance
## 28     atc             2    atc 17.76                 5
## 1603   atc            19    atc  7.78                22
## 1264   atc             7    atc  6.75                10
## 1219   atc            10    atc  6.67                13
## 1583   atc             9    atc  6.16                12
## 5986   atc             5    atc  5.90                 8
nrow(GAT_ATC)
## [1] 21
head(GAT_ATC)
##       first dyad_distance second  ratio relative_distance
## 1       gat             3    atc 145.17                 8
## 15239   gat             0    atc   9.57                 5
## 18183   gat             4    atc   9.14                 9
## 16303   gat            16    atc   6.04                21
## 19417   gat            15    atc   6.02                20
## 18091   gat            20    atc   5.90                25

GAT_GAT will be the orange trace, since the second GAT is on the same strand relative to the first GAT; similarly, GAT_ATC will be the dark green trace, as the second ATC is on the opposite strand relative to the first GAT.

GAT_GAT$query_status="same_strand_GAT"
GAT_ATC$query_status="opposite_strand_GAT"
df.plot=rbind(GAT_GAT, GAT_ATC)
str(df.plot)
## 'data.frame':    42 obs. of  6 variables:
##  $ first            : chr  "atc" "atc" "atc" "atc" ...
##  $ dyad_distance    : num  2 19 7 10 9 5 12 20 8 15 ...
##  $ second           : chr  "atc" "atc" "atc" "atc" ...
##  $ ratio            : num  17.76 7.78 6.75 6.67 6.16 ...
##  $ relative_distance: num  5 22 10 13 12 8 15 23 11 18 ...
##  $ query_status     : chr  "same_strand_GAT" "same_strand_GAT" "same_strand_GAT" "same_strand_GAT" ...
unique(df.plot$query_status)
## [1] "same_strand_GAT"     "opposite_strand_GAT"
#[1] "same_strand_GAT"     "opposite_strand_GAT"
head(df.plot)
##      first dyad_distance second ratio relative_distance    query_status
## 28     atc             2    atc 17.76                 5 same_strand_GAT
## 1603   atc            19    atc  7.78                22 same_strand_GAT
## 1264   atc             7    atc  6.75                10 same_strand_GAT
## 1219   atc            10    atc  6.67                13 same_strand_GAT
## 1583   atc             9    atc  6.16                12 same_strand_GAT
## 5986   atc             5    atc  5.90                 8 same_strand_GAT
df.plot$query_status = factor(df.plot$query_status, levels = c("same_strand_GAT", "opposite_strand_GAT"))
nrow(df.plot)
## [1] 42
#42
summary(df.plot)
##     first           dyad_distance    second              ratio        
##  Length:42          Min.   : 0    Length:42          Min.   :  1.960  
##  Class :character   1st Qu.: 5    Class :character   1st Qu.:  4.890  
##  Mode  :character   Median :10    Mode  :character   Median :  5.310  
##                     Mean   :10                       Mean   :  8.977  
##                     3rd Qu.:15                       3rd Qu.:  5.900  
##                     Max.   :20                       Max.   :145.170  
##  relative_distance              query_status
##  Min.   : 3        same_strand_GAT    :21   
##  1st Qu.: 9        opposite_strand_GAT:21   
##  Median :14                                 
##  Mean   :14                                 
##  3rd Qu.:19                                 
##  Max.   :25

xyplot

library(lattice)
library(latticeExtra)
# Plot the RSAT obs/exp ratio against relative_distance for motif1,
# with one colour per query_status level.
xyplot(
  ratio ~ relative_distance,
  data = df.plot,
  groups = query_status,
  auto.key = list(space = "right", points = TRUE),
  aspect = 1,
  xlim = c(0, 30),
  ylim = c(0, 200),
  xlab = "distance (bp) from 2nd closest GAT to closest GAT",
  ylab = "RSAT obs/exp Ratio",
  main = "GATA3 peak with motif1",
  between = list(y = 1.0),
  scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
  par.settings = list(
    superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
    strip.background = list(col = "grey85")
  ),
  panel = function(x, y, ...) {
    # Light vertical guide lines at 1-10 bp, drawn before the points
    panel.abline(v = seq(1, 10, 1), col = "grey90")
    panel.xyplot(x, y,
                 col = c("orange", "darkgreen"),
                 pch = 18,
                 cex = 0.6, ...)
  }
)

Loop through the RSAT results for the other positive-control motifs, extract the relevant dyad patterns, and make an xyplot/barchart for each:

library(Biostrings)
library(lattice)
library(latticeExtra)

#function
# Extract one dyad family from an RSAT dyad-analysis result table.
#
# The file is read as tab-separated text without a header. Column 1 holds the
# dyad (e.g. "gatn{3}atc") and column 8 the obs/exp ratio. Each dyad is split
# into its first element, its spacing and its second element, and the rows
# whose elements match `pattern1` / `pattern2` (case-insensitive) are kept.
# When both patterns are identical, rows written with the reverse complement
# of the patterns are kept as well (e.g. the "atc...atc" rows for GAT/GAT).
#
# Arguments:
#   file_path  path to the RSAT dyad-analysis output file
#   pattern1   DNA string for the first dyad element, e.g. "GAT"
#   pattern2   DNA string for the second dyad element, e.g. "ATC"
#   number     offset added to the RSAT spacing to get `relative_distance`
#
# Returns a data.frame with the columns first, dyad_distance, second, ratio
# and relative_distance, ordered by decreasing ratio.
process_and_subset_RSAT <- function(file_path, pattern1, pattern2, number) {
  # Keep text columns as character on every R version (strsplit needs it)
  input_data <- read.table(file_path, header = FALSE, sep = "\t",
                           stringsAsFactors = FALSE)

  # "gatn{3}atc" -> c("gat", "3", "atc")
  components_list <- strsplit(input_data$V1, "n\\{|\\}")
  # vapply keeps a character result even for empty or malformed entries
  component <- function(i) {
    vapply(components_list, function(x) x[i], character(1))
  }

  processed_data <- data.frame(
    first = component(1),
    dyad_distance = as.numeric(component(2)),
    second = component(3),
    ratio = input_data$V8, # ratio
    stringsAsFactors = FALSE
  )

  # DNAString() validates the patterns; the matching itself is done on plain
  # character strings so grepl() never receives an S4 object.
  pattern1 <- DNAString(pattern1)
  pattern2 <- DNAString(pattern2)
  fwd1 <- as.character(pattern1)
  fwd2 <- as.character(pattern2)

  matches_dyad <- function(p1, p2) {
    grepl(p1, processed_data$first, ignore.case = TRUE) &
      grepl(p2, processed_data$second, ignore.case = TRUE)
  }

  keep_fwd <- matches_dyad(fwd1, fwd2)
  output_data <- processed_data[keep_fwd, ]

  if (fwd1 == fwd2) {
    rc1 <- as.character(reverseComplement(pattern1))
    rc2 <- as.character(reverseComplement(pattern2))
    # Exclude rows already selected above, so a pattern that equals its own
    # reverse complement does not end up duplicated in the result.
    keep_rc <- matches_dyad(rc1, rc2) & !keep_fwd
    output_data <- rbind(output_data, processed_data[keep_rc, ])
  }

  # Order by descending ratio
  output_data <- output_data[order(-output_data$ratio), ]

  # Convert the RSAT spacing into the relative distance used elsewhere
  output_data$relative_distance <- output_data$dyad_distance + number

  output_data
}

# Loop through every per-motif RSAT dyad result file: extract the GAT-GAT and
# GAT-ATC dyads and write one xyplot PDF per motif.
for (dyad.results in Sys.glob(file.path("./GATA3_peak_161win_with_motif_*_RSAT_dyad.txt"))) {
  print(dyad.results)
  # ".../GATA3_peak_161win_with_motif_5_RSAT_dyad.txt" -> "motif5"
  motif.name <- paste0(
    "motif",
    sub("_RSAT_dyad\\.txt$", "",
        sub("^GATA3_peak_161win_with_motif_", "", basename(dyad.results)))
  )
  print(motif.name)

  GAT_GAT <- process_and_subset_RSAT(dyad.results, "GAT", "GAT", 3)
  GAT_ATC <- process_and_subset_RSAT(dyad.results, "GAT", "ATC", 5)

  # Values are not auto-printed inside a for loop, so print them explicitly
  print(nrow(GAT_GAT))
  print(head(GAT_GAT))
  print(nrow(GAT_ATC))
  print(head(GAT_ATC))

  # rep() keeps the assignment valid even when a subset has zero rows
  GAT_GAT$query_status <- rep("same_strand_GAT", nrow(GAT_GAT))
  GAT_ATC$query_status <- rep("opposite_strand_GAT", nrow(GAT_ATC))
  df.plot <- rbind(GAT_GAT, GAT_ATC)
  str(df.plot)

  print(unique(df.plot$query_status))
  #[1] "same_strand_GAT"     "opposite_strand_GAT"
  print(head(df.plot))
  df.plot$query_status <- factor(df.plot$query_status,
                                 levels = c("same_strand_GAT", "opposite_strand_GAT"))
  print(nrow(df.plot))
  print(summary(df.plot))

  # Keep the common 0-200 axis when the data fit, but extend it when a ratio
  # is larger, so that the strongest point is never silently clipped.
  finite.ratio <- df.plot$ratio[is.finite(df.plot$ratio)]
  y.top <- max(200, 1.05 * finite.ratio)

  pdf(paste0('xy_RSAT_dyad_closest_2nd_GAT_to_closest_1st_GAT_GATA3_peak_with_', motif.name, '.pdf'), width=15,height=5)
  # Always close the PDF device, even if plotting fails for this motif
  tryCatch(
    print(
      xyplot(ratio ~ relative_distance,
             data = df.plot,
             groups = query_status,
             auto.key = list(space = "right", points = TRUE),
             aspect = 1,
             xlim = c(0, 30),
             ylim = c(0, y.top),
             xlab = "distance (bp) from 2nd closest GAT to closest GAT",
             ylab = "RSAT obs/exp Ratio",
             main = paste0("GATA3 peak with ", motif.name),
             between = list(y = 1.0),
             scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
             par.settings = list(
               superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
               strip.background = list(col = "grey85")
             ),
             panel = function(x, y, ...) {
               # Light vertical guide lines at 1-10 bp, drawn before the points
               panel.abline(v = c(seq(1, 10, 1)), col = "grey90")
               panel.xyplot(x, y,
                            col = c("orange", "darkgreen"),
                            pch = 18,
                            cex = 0.6, ...)
             })
    ),
    finally = dev.off()
  )
}

3.2.1 RSAT–motif5

In the previous analysis, we extracted dyad structures anchored at “GAT.” However, we have discussed the sensitivity of RSAT analysis to structures with two reverse-complement dyads. For motif5 analysis, we aim to focus on structures anchored specifically at “ATC” to ensure we identify the correct structure.

library(Biostrings)

# Extract one dyad family from an RSAT dyad-analysis result table.
#
# The file is read as tab-separated text without a header. Column 1 holds the
# dyad (e.g. "gatn{3}atc") and column 8 the obs/exp ratio. Each dyad is split
# into its first element, its spacing and its second element, and the rows
# whose elements match `pattern1` / `pattern2` (case-insensitive) are kept.
# When both patterns are identical, rows written with the reverse complement
# of the patterns are kept as well (e.g. the "gat...gat" rows for ATC/ATC).
#
# Arguments:
#   file_path  path to the RSAT dyad-analysis output file
#   pattern1   DNA string for the first dyad element, e.g. "ATC"
#   pattern2   DNA string for the second dyad element, e.g. "GAT"
#   number     offset added to the RSAT spacing to get `relative_distance`
#
# Returns a data.frame with the columns first, dyad_distance, second, ratio
# and relative_distance, ordered by decreasing ratio.
process_and_subset_RSAT <- function(file_path, pattern1, pattern2, number) {
  # Keep text columns as character on every R version (strsplit needs it)
  input_data <- read.table(file_path, header = FALSE, sep = "\t",
                           stringsAsFactors = FALSE)

  # "atcn{1}gat" -> c("atc", "1", "gat")
  components_list <- strsplit(input_data$V1, "n\\{|\\}")
  # vapply keeps a character result even for empty or malformed entries
  component <- function(i) {
    vapply(components_list, function(x) x[i], character(1))
  }

  processed_data <- data.frame(
    first = component(1),
    dyad_distance = as.numeric(component(2)),
    second = component(3),
    ratio = input_data$V8, # ratio
    stringsAsFactors = FALSE
  )

  # DNAString() validates the patterns; the matching itself is done on plain
  # character strings so grepl() never receives an S4 object.
  pattern1 <- DNAString(pattern1)
  pattern2 <- DNAString(pattern2)
  fwd1 <- as.character(pattern1)
  fwd2 <- as.character(pattern2)

  matches_dyad <- function(p1, p2) {
    grepl(p1, processed_data$first, ignore.case = TRUE) &
      grepl(p2, processed_data$second, ignore.case = TRUE)
  }

  keep_fwd <- matches_dyad(fwd1, fwd2)
  output_data <- processed_data[keep_fwd, ]

  if (fwd1 == fwd2) {
    rc1 <- as.character(reverseComplement(pattern1))
    rc2 <- as.character(reverseComplement(pattern2))
    # Exclude rows already selected above, so a pattern that equals its own
    # reverse complement does not end up duplicated in the result.
    keep_rc <- matches_dyad(rc1, rc2) & !keep_fwd
    output_data <- rbind(output_data, processed_data[keep_rc, ])
  }

  # Order by descending ratio
  output_data <- output_data[order(-output_data$ratio), ]

  # Convert the RSAT spacing into the relative distance used elsewhere
  output_data$relative_distance <- output_data$dyad_distance + number

  output_data
}


ATC_GAT <- process_and_subset_RSAT("./GATA3_peak_161win_with_motif_5_RSAT_dyad.txt", "ATC", "GAT", 1)
ATC_ATC <- process_and_subset_RSAT("./GATA3_peak_161win_with_motif_5_RSAT_dyad.txt", "ATC", "ATC", 3)

nrow(ATC_GAT)
## [1] 21
head(ATC_GAT)
##       first dyad_distance second  ratio relative_distance
## 3       atc             1    gat 271.11                 2
## 7076    atc             8    gat   8.94                 9
## 11672   atc            10    gat   8.03                11
## 12399   atc             6    gat   7.49                 7
## 14492   atc             7    gat   6.98                 8
## 12923   atc            14    gat   6.96                15
nrow(ATC_ATC)
## [1] 21
head(ATC_ATC)
##      first dyad_distance second ratio relative_distance
## 14     atc             2    atc 21.52                 5
## 342    atc             9    atc  8.47                12
## 849    atc             6    atc  7.87                 9
## 855    atc            10    atc  7.04                13
## 1116   atc             4    atc  6.87                 7
## 2937   atc            11    atc  5.81                14
ATC_ATC$query_status="same_strand_ATC"
ATC_GAT$query_status="opposite_strand_ATC"
df.plot=rbind(ATC_ATC, ATC_GAT)
str(df.plot)
## 'data.frame':    42 obs. of  6 variables:
##  $ first            : chr  "atc" "atc" "atc" "atc" ...
##  $ dyad_distance    : num  2 9 6 10 4 11 7 12 14 16 ...
##  $ second           : chr  "atc" "atc" "atc" "atc" ...
##  $ ratio            : num  21.52 8.47 7.87 7.04 6.87 ...
##  $ relative_distance: num  5 12 9 13 7 14 10 15 17 19 ...
##  $ query_status     : chr  "same_strand_ATC" "same_strand_ATC" "same_strand_ATC" "same_strand_ATC" ...
unique(df.plot$query_status)
## [1] "same_strand_ATC"     "opposite_strand_ATC"
head(df.plot)
##      first dyad_distance second ratio relative_distance    query_status
## 14     atc             2    atc 21.52                 5 same_strand_ATC
## 342    atc             9    atc  8.47                12 same_strand_ATC
## 849    atc             6    atc  7.87                 9 same_strand_ATC
## 855    atc            10    atc  7.04                13 same_strand_ATC
## 1116   atc             4    atc  6.87                 7 same_strand_ATC
## 2937   atc            11    atc  5.81                14 same_strand_ATC
df.plot$query_status = factor(df.plot$query_status, levels = c("same_strand_ATC", "opposite_strand_ATC"))
nrow(df.plot)
## [1] 42
#42
summary(df.plot)
##     first           dyad_distance    second              ratio        
##  Length:42          Min.   : 0    Length:42          Min.   :  1.850  
##  Class :character   1st Qu.: 5    Class :character   1st Qu.:  4.540  
##  Mode  :character   Median :10    Mode  :character   Median :  5.520  
##                     Mean   :10                       Mean   : 12.089  
##                     3rd Qu.:15                       3rd Qu.:  6.845  
##                     Max.   :20                       Max.   :271.110  
##  relative_distance              query_status
##  Min.   : 1        same_strand_ATC    :21   
##  1st Qu.: 7        opposite_strand_ATC:21   
##  Median :12                                 
##  Mean   :12                                 
##  3rd Qu.:17                                 
##  Max.   :23

xyplot

library(lattice)
library(latticeExtra)
#pdf(paste0('xy_RSAT_dyad_closest_2nd_ATC_to_closest_1st_ATC_GATA3_peak_with_motif5.pdf'), width=10,height=5)
#print(
  # Scatter of the RSAT obs/exp ratio against the relative distance between the
  # two 3-mers; the two query_status groups get one colour each.
  xyplot(
    ratio ~ relative_distance,
    data = df.plot,
    groups = query_status,
    main = "GATA3 peak with motif5",
    xlab = "distance (bp) from 2nd closest ATC to closest ATC",
    ylab = "RSAT obs/exp Ratio",
    xlim = c(0, 30),
    ylim = c(0, 300),
    aspect = 1,
    between = list(y = 1.0),
    # legend on the right-hand side, drawn with point symbols
    auto.key = list(space = "right", points = TRUE),
    scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
    par.settings = list(
      superpose.symbol = list(col = c("#8B4513", "#145A8C"), pch = 18, lwd = 2),
      strip.background = list(col = "grey85")
    ),
    panel = function(x, y, ...) {
      # light vertical guides at 1..10 bp, drawn first so the points lie on top
      panel.abline(v = seq(1, 10, 1), col = "grey90")
      panel.xyplot(
        x, y,
        col = c("#8B4513", "#145A8C"),
        pch = 18,
        cex = 0.6,
        ...
      )
    }
  )

#)
#dev.off()

4 Comparing with Customized analysis

For the customized analysis:

  1. Merge the occurrences of the anchored GAT on both the plus and minus strands.

  2. Determine the second closest GAT relative to the strandedness of the anchored GAT. For example, if anchoring at GAT (regardless of its strand), the “minus-GAT” trace should be the sum of the occurrences of “+GAT” anchored at -GAT and “-GAT” anchored at +GAT.

  3. Calculate the relative frequencies (y-axis in plots) as the actual occurrences of the pattern minus the frequencies from the DHS negative control.

# define function
# Relative frequency of every distinct value in `data`.
#
# data: atomic vector of observations (here: absolute distances).
#
# Returns a data.frame with one row per distinct non-NA value:
#   value       - the value as a character label (names of the table() result)
#   actual_freq - number of occurrences divided by length(data)
# table() ignores NA while length() counts it, so the frequencies are
# relative to all inputs, including missing ones.
# When there is nothing to tabulate (empty or all-NA input) a zero-row data
# frame with both columns is returned, so that a later
# merge(..., by.y = "value") still finds its key column.
calculate_actual_frequency <- function(data) {
  counts <- table(data)
  if (length(counts) == 0L) {
    return(data.frame(
      value = character(0),
      actual_freq = numeric(0),
      stringsAsFactors = FALSE
    ))
  }
  data.frame(
    value = names(counts),
    actual_freq = as.vector(counts) / length(data),
    stringsAsFactors = FALSE
  )
}


# First version of the customized analysis.
# For every motif set: read the "closest 2nd GAT to 1st GAT" distance files of
# the GATA3 peaks and of the DHS regions, compute the frequency of each
# absolute distance PER FILE (i.e. per anchor-strand / query-strand
# combination), subtract the DHS frequency from the GATA3 frequency, and plot
# the difference against the distance (one PDF and one PNG per motif).
my_motifs = c("motif1", "motif4","motif2", "motif6", "motif5")
for (motif in my_motifs) {
  print(motif)
  #GATA peaks
  # Empty accumulator; the rows of every matching file are appended below.
  df.plot.GATA = data.frame(matrix(nrow = 0, ncol = 5))     
  colnames(df.plot.GATA) = c("dis","anchor_status", "query_status","abs.dis", "actual_freq")
  for (closest_2nd_dis in Sys.glob(file.path(paste0("./closest.2nd*GAT.to.1st*GAT.GATA3.", motif, ".bed")))) {
    print(closest_2nd_dis)
    # Anchor label: take the last "/"-separated path component, split it on the
    # regex 'closest.2nd.*.to.1st.' and keep the second piece, then cut off the
    # ".GATA3.<motif>.bed" suffix.
    anchor_status =strsplit((strsplit(strsplit(closest_2nd_dis, "/")[[1]][length(strsplit(closest_2nd_dis, "/")[[1]])], 'closest.2nd.*.to.1st.')[[1]][2]), paste0(".GATA3.", motif, ".bed"))[[1]][1]
    print(anchor_status)
    # Query label: the part of the file name after 'closest.2nd.' and before
    # the ".to.1st.<...>.GATA3.<motif>.bed" tail.
    query_status =strsplit((strsplit(strsplit(closest_2nd_dis, "/")[[1]][length(strsplit(closest_2nd_dis, "/")[[1]])], 'closest.2nd.')[[1]][2]), paste0(".to.1st.*.GATA3.", motif, ".bed"))[[1]][1]
    print(query_status)
    # Column 11 of the file is used as the signed distance
    # (presumably the distance column written by bedtools closest - TODO confirm).
    # cbind() with the two labels yields character columns, hence as.integer() below.
    temp = as.data.frame(cbind(read.table(closest_2nd_dis, header=F, comment.char='')[,11], anchor_status, query_status)) 
    colnames(temp) = c("dis", "anchor_status", "query_status")
    temp$dis=as.integer(temp$dis)
    temp$abs.dis=abs(temp$dis)
    # Frequency of each absolute distance within THIS file only.
    actual_frequencies = calculate_actual_frequency(temp$abs.dis)
    temp1= merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
    df.plot.GATA = rbind(df.plot.GATA,temp1)
  }
  df.plot.GATA$anchor_status = factor(df.plot.GATA$anchor_status, levels = c("plus.GAT", "minus.GAT"))
  df.plot.GATA$query_status = factor(df.plot.GATA$query_status, levels = c("plus.GAT", "minus.GAT"))
  # Keep one row per distinct combination of distance, labels and frequency.
  uniq.df.plot.GATA=df.plot.GATA[!duplicated(df.plot.GATA), ]
  
  
  #DHS regions
  # Same processing for the DHS files. The glob below does not contain
  # `motif`, so the same DHS files are read again on every iteration.
  df.plot.DHS = data.frame(matrix(nrow = 0, ncol = 5))     
colnames(df.plot.DHS) = c("dis","anchor_status", "query_status","abs.dis", "actual_freq_neg")
for (closest_2nd_dis in Sys.glob(file.path("./closest.2nd*GAT.to.1st*GAT.indep.DHS.bed"))) {
    print(closest_2nd_dis)
    anchor_status =strsplit((strsplit(strsplit(closest_2nd_dis, "/")[[1]][length(strsplit(closest_2nd_dis, "/")[[1]])], 'closest.2nd.*.to.1st.')[[1]][2]), ".indep.DHS.bed")[[1]][1]
    print(anchor_status)
    query_status =strsplit((strsplit(strsplit(closest_2nd_dis, "/")[[1]][length(strsplit(closest_2nd_dis, "/")[[1]])], 'closest.2nd.')[[1]][2]), ".to.1st.*.indep.DHS.bed")[[1]][1]
    print(query_status)
    temp = as.data.frame(cbind(read.table(closest_2nd_dis, header=F, comment.char='')[,11], anchor_status, query_status)) 
    colnames(temp) = c("dis", "anchor_status", "query_status")
    temp$dis=as.integer(temp$dis)
    temp$abs.dis=abs(temp$dis)
    actual_frequency = calculate_actual_frequency(temp$abs.dis)
    temp1= merge(temp, actual_frequency, by.x = "abs.dis", by.y = "value", all.x = TRUE)
    df.plot.DHS = rbind(df.plot.DHS, temp1)
}
    # Rename the 5th column so the DHS frequency does not clash with the
    # GATA3 "actual_freq" column in the merge below.
    colnames(df.plot.DHS)[5]="actual_freq_DHS"
    df.plot.DHS$anchor_status = factor(df.plot.DHS$anchor_status, levels = c("plus.GAT", "minus.GAT"))
    df.plot.DHS$query_status = factor(df.plot.DHS$query_status, levels = c("plus.GAT", "minus.GAT"))
    uniq.df.plot.DHS=df.plot.DHS[!duplicated(df.plot.DHS), ]
    #nrow(uniq.df.plot.DHS) #[1] 2859
  
  #calculate the relative frequency
    #by subtraction of actual frequency between GATA3 peaks and DHS regions
    # all.x = TRUE keeps every GATA3 row; rows without a DHS counterpart get
    # NA as actual_freq_DHS and therefore NA as rel_freq.
    df.plot=merge(uniq.df.plot.GATA, uniq.df.plot.DHS, by=c("abs.dis", "dis", "anchor_status", "query_status"), all.x = TRUE)
    df.plot$rel_freq <- ifelse(is.na(df.plot$actual_freq_DHS), NA, df.plot$actual_freq - df.plot$actual_freq_DHS)
    # Collapse the four anchor/query combinations into same vs. opposite strand.
    # The frequencies above were computed per file, so the two "opposite"
    # combinations (and the two "same" ones) each keep their own value.
    df.plot$strand_relationship <- ifelse(df.plot$anchor_status == df.plot$query_status,
                                              "same_strand_GAT", "opposite_strand_GAT")
    
    df.plot$strand_relationship = factor(df.plot$strand_relationship, levels = c("same_strand_GAT", "opposite_strand_GAT"))
    
library(lattice)
library(latticeExtra)
# PDF: a single panel, points grouped by strand relationship.
# ylim starts at 0, so negative rel_freq values fall outside the plotted range.
pdf(paste0('xy_closest_2nd_GAT_to_closest_1st_GAT_to_GATA3_pos_', motif, '_compare_to_DHS.pdf'), width=15,height=5)
print(xyplot(rel_freq ~ abs.dis, 
         data = df.plot, 
         groups = strand_relationship,
         auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
         aspect = 1,
         xlim=c(0,50),
         ylim=c(0, 0.3),
         #type = c('p', 'smooth'),
         xlab = "distance (bp) from 2nd closest GAT to closest GAT",
         ylab="Frequency relative to DHS regions",
         main=paste0("GATA3 peak with ", motif),
         between=list(y=1.0),
         scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
         par.settings = list(superpose.line = list(col=c("pink", "skyblue"), lwd=2), strip.background=list(col="grey85")),
         # Panel: density curve of the distances, then the rel_freq points,
         # then vertical guide lines at 1..10 bp (drawn last, i.e. on top).
         panel = function(x,y,...) {panel.densityplot(x, data = df.plot, 
                                                        from=0, 
                                                        to=50, 
                                                        lty = c(1),
                                                        lwd=2, 
                                                        darg=list(bw = "nrd0", kernel="gaussian"),
                                                        type = "count",
                                                        col=c("pink","skyblue"), ...)
                                      panel.xyplot(x, y, 
                                                   col=c("red","blue"), 
                                                   pch=18, 
                                                   cex=0.6,...)
                                      panel.abline(v=c(seq(1, 10, 1)), col = "grey90")
                                      
  })
)
dev.off()

# PNG: one panel per anchor strand (| anchor_status), points grouped by the
# query strand instead of the collapsed strand relationship.
png(paste0('xy_closest_2nd_GAT_to_closest_1st_GAT_to_GATA3_pos_', motif, '_compare_to_DHS.png'))
print(xyplot(rel_freq ~ abs.dis | anchor_status, 
         #data = df.plot[!duplicated(df.plot), ],
         data = df.plot, 
         groups = query_status,
         auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
         aspect = 1,
         xlim=c(0,50),
         ylim=c(0, 0.3),
         #type = c('p', 'smooth'),
         xlab = "distance (bp) from 2nd closest GAT to closest GAT",
         ylab="Frequency relative to DHS regions",
         main=paste0("GATA3 peak with ", motif),
         between=list(y=1.0),
         scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
         par.settings = list(superpose.line = list(col=c("pink", "skyblue"), lwd=2), strip.background=list(col="grey85")),
         panel = function(x,y,...) {panel.densityplot(x, data = df.plot, 
                                                        from=0, 
                                                        to=50, 
                                                        lty = c(1),
                                                        lwd=2, 
                                                        darg=list(bw = "nrd0", kernel="gaussian"),
                                                        type = "count",
                                                        col=c("pink","skyblue"), ...)
                                      panel.xyplot(x, y, 
                                                   col=c("red","blue"), 
                                                   pch=18, 
                                                   cex=0.6,...)
                                      panel.abline(v=c(seq(1, 10, 1)), col = "grey90")
                                      
  })
)
dev.off()
}
xy plot of GATA3 peak with motif1 relative to the DHS regions: 3mer structure

Figure 1: xy plot of GATA3 peak with motif1 relative to the DHS regions: 3mer structure

The plot generated by the code above shows two points at the same distance for the same strand relationship. This is because plus.GAT-minus.GAT and minus.GAT-plus.GAT are both labeled “opposite_strand_GAT”, but their frequencies are calculated separately for each file, so the two relative frequencies differ slightly (although they are very close).

See the plotting data frame below:

abs.dis dis anchor_status query_status  actual_freq actual_freq_DHS
1       1   1      plus.GAT    minus.GAT 0.0008025682    0.0008209464
2       1   1     minus.GAT     plus.GAT 0.0009630046    0.0008557308
3       2   2      plus.GAT    minus.GAT 0.0119582665    0.0027248432
4       2   2     minus.GAT     plus.GAT 0.0127598106    0.0026021201
5       3   3      plus.GAT     plus.GAT 0.0300938929    0.0162763486
6       3   3      plus.GAT    minus.GAT 0.0760032103    0.1039283157
       rel_freq strand_relationship
1 -1.837814e-05 opposite_strand_GAT
2  1.072738e-04 opposite_strand_GAT
3  9.233423e-03 opposite_strand_GAT
4  1.015769e-02 opposite_strand_GAT
5  1.381754e-02     same_strand_GAT
6 -2.792511e-02 opposite_strand_GAT

This is not ideal, so I will modify the code: instead of calculating the actual frequency for each file separately, I will combine the files, label each row by its strand relationship, and then calculate the actual frequency.

# define function
# Relative frequency of every distinct value in `data`.
#
# data: atomic vector of observations (here: absolute distances).
#
# Returns a data.frame with one row per distinct non-NA value:
#   value       - the value as a character label (names of the table() result)
#   actual_freq - number of occurrences divided by length(data)
# table() ignores NA while length() counts it, so the frequencies are
# relative to all inputs, including missing ones.
# When there is nothing to tabulate (empty or all-NA input) a zero-row data
# frame with both columns is returned, so that a later
# merge(..., by.y = "value") still finds its key column.
calculate_actual_frequency <- function(data) {
  counts <- table(data)
  if (length(counts) == 0L) {
    return(data.frame(
      value = character(0),
      actual_freq = numeric(0),
      stringsAsFactors = FALSE
    ))
  }
  data.frame(
    value = names(counts),
    actual_freq = as.vector(counts) / length(data),
    stringsAsFactors = FALSE
  )
}


# Second version of the customized analysis: the files are pooled first,
# every row is labelled with its strand relationship, and only then is the
# frequency of each absolute distance computed (once per strand relationship
# instead of once per file).
library(lattice)
library(latticeExtra)

my_motifs <- c("motif1", "motif4", "motif2", "motif6", "motif5")

# Factor levels shared by the GATA3 and the DHS tables.
gat_strand_levels <- c("plus.GAT", "minus.GAT")
gat_relationship_levels <- c("same_strand_GAT", "opposite_strand_GAT")

# Read every BED file matching `glob_pattern` and stack the distances.
#
# glob_pattern: wildcard pattern handed to Sys.glob().
# file_suffix:  trailing part of the file names (e.g. ".indep.DHS.bed"); it is
#               used as a regular expression to cut the strand labels out of
#               the file name.
#
# Returns a data.frame with the columns dis (column 11 of the file as
# integer), anchor_status, query_status (factors parsed from the file name),
# abs.dis (= abs(dis)) and strand_relationship (same vs. opposite strand).
# Stops with an explicit message when no file matches.
read_closest_gat_distances <- function(glob_pattern, file_suffix) {
  bed_files <- Sys.glob(glob_pattern)
  if (length(bed_files) == 0L) {
    stop("no BED files match '", glob_pattern, "'", call. = FALSE)
  }

  per_file <- lapply(bed_files, function(bed_file) {
    print(bed_file)
    file_name <- basename(bed_file)
    # anchor label: text after "...to.1st." up to the suffix
    anchor_status <- strsplit(
      strsplit(file_name, "closest.2nd.*.to.1st.")[[1]][2],
      file_suffix
    )[[1]][1]
    print(anchor_status)
    # query label: text after "closest.2nd." up to ".to.1st."
    query_status <- strsplit(
      strsplit(file_name, "closest.2nd.")[[1]][2],
      paste0(".to.1st.*", file_suffix)
    )[[1]][1]
    print(query_status)

    dis <- as.integer(
      read.table(bed_file, header = FALSE, comment.char = "")[, 11]
    )
    data.frame(
      dis = dis,
      anchor_status = anchor_status,
      query_status = query_status,
      abs.dis = abs(dis),
      stringsAsFactors = FALSE
    )
  })

  distances <- do.call(rbind, per_file)
  distances$anchor_status <- factor(distances$anchor_status,
                                    levels = gat_strand_levels)
  distances$query_status <- factor(distances$query_status,
                                   levels = gat_strand_levels)
  distances$strand_relationship <- factor(
    ifelse(distances$anchor_status == distances$query_status,
           "same_strand_GAT", "opposite_strand_GAT"),
    levels = gat_relationship_levels
  )
  distances
}

# Attach to every row the frequency of its absolute distance, computed
# separately within the same-strand and the opposite-strand rows.
# Adds the column actual_freq; the same-strand rows come first.
add_strand_frequencies <- function(distances) {
  per_relationship <- lapply(gat_relationship_levels, function(relationship) {
    rows <- distances[distances$strand_relationship == relationship, ]
    if (nrow(rows) == 0L) {
      return(NULL)  # nothing to tabulate for this relationship
    }
    merge(rows, calculate_actual_frequency(rows$abs.dis),
          by.x = "abs.dis", by.y = "value", all.x = TRUE)
  })
  do.call(rbind, per_relationship)
}

# Build the trellis object: rel_freq against abs.dis, grouped by strand
# relationship. ylim starts at 0, so negative rel_freq values are not shown.
build_rel_freq_plot <- function(plot_data, motif) {
  xyplot(
    rel_freq ~ abs.dis,
    data = plot_data,
    groups = strand_relationship,
    auto.key = list(space = "right", points = TRUE),
    aspect = 1,
    xlim = c(0, 50),
    ylim = c(0, 0.3),
    xlab = "distance (bp) from 2nd closest GAT to closest GAT",
    ylab = "Frequency relative to DHS regions",
    main = paste0("GATA3 peak with ", motif),
    between = list(y = 1.0),
    scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
    par.settings = list(
      superpose.symbol = list(col = c("red", "blue"), pch = 18, lwd = 2),
      strip.background = list(col = "grey85")
    ),
    panel = function(x, y, ...) {
      # guides first, then the density curve, then the points on top
      panel.abline(v = seq(1, 10, 1), col = "grey90")
      panel.densityplot(x, data = plot_data,
                        from = 0,
                        to = 50,
                        lty = 1,
                        lwd = 2,
                        darg = list(bw = "nrd0", kernel = "gaussian"),
                        type = "count",
                        col = c("pink", "skyblue"), ...)
      panel.xyplot(x, y,
                   col = c("red", "blue"),
                   pch = 18,
                   cex = 0.6, ...)
    }
  )
}

#DHS regions
# The DHS files do not depend on the motif, so the background frequencies are
# computed once, before the loop.
df.plot.DHS <- read_closest_gat_distances(
  "./closest.2nd*GAT.to.1st*GAT.indep.DHS.bed",
  ".indep.DHS.bed"
)
df.plot.DHS1 <- add_strand_frequencies(df.plot.DHS)
uniq.df.plot.DHS <- df.plot.DHS1[!duplicated(df.plot.DHS1), ]
names(uniq.df.plot.DHS)[names(uniq.df.plot.DHS) == "actual_freq"] <-
  "actual_freq_DHS"

for (motif in my_motifs) {
  print(motif)

  #GATA peaks
  df.plot.GATA <- read_closest_gat_distances(
    paste0("./closest.2nd*GAT.to.1st*GAT.GATA3.", motif, ".bed"),
    paste0(".GATA3.", motif, ".bed")
  )
  df.plot.GATA1 <- add_strand_frequencies(df.plot.GATA)
  uniq.df.plot.GATA <- df.plot.GATA1[!duplicated(df.plot.GATA1), ]

  #calculate the relative frequency
  #by subtraction of actual frequency between GATA3 peaks and DHS regions
  # all.x = TRUE keeps GATA3 rows without a DHS counterpart; their
  # actual_freq_DHS is NA, which makes rel_freq NA as well.
  df.plot1 <- merge(
    uniq.df.plot.GATA, uniq.df.plot.DHS,
    by = c("abs.dis", "dis", "anchor_status", "query_status",
           "strand_relationship"),
    all.x = TRUE
  )
  df.plot1$rel_freq <- df.plot1$actual_freq - df.plot1$actual_freq_DHS

  # Drop the per-file strand labels; what remains is one value per distance
  # and strand relationship.
  df.plot <- df.plot1[, c("abs.dis", "dis", "strand_relationship",
                          "actual_freq", "actual_freq_DHS", "rel_freq")]
  df.plot <- df.plot[!duplicated(df.plot), ]

  rel_freq_plot <- build_rel_freq_plot(df.plot, motif)

  # tryCatch(finally = ) closes the device even when drawing fails
  pdf(paste0('xy2_closest_2nd_GAT_to_closest_1st_GAT_to_GATA3_pos_', motif, '_compare_to_DHS.pdf'), width = 15, height = 5)
  tryCatch(print(rel_freq_plot), finally = dev.off())

  png(paste0('xy2_closest_2nd_GAT_to_closest_1st_GAT_to_GATA3_pos_', motif, '_compare_to_DHS.png'))
  tryCatch(print(rel_freq_plot), finally = dev.off())
}

Now the plotting data frame contains only a single relative frequency value per distance for each strand relationship.

 abs.dis dis strand_relationship  actual_freq actual_freq_DHS      rel_freq
1        1   1 opposite_strand_GAT 0.0008827896    0.0008383401  4.444953e-05
3        2   2 opposite_strand_GAT 0.0123590546    0.0026634763  9.695578e-03
5        3   3     same_strand_GAT 0.0286505357    0.0162428392  1.240770e-02
6        3   3 opposite_strand_GAT 0.0796516994    0.1027927204 -2.314102e-02
9        4   4     same_strand_GAT 0.0108743630    0.0088636999  2.010663e-03
10       4   4 opposite_strand_GAT 0.0085871353    0.0040607098  4.526426e-03
xy plot of GATA3 peak with motif1 relative to the DHS regions: 3mer structure

Figure 2: xy plot of GATA3 peak with motif1 relative to the DHS regions: 3mer structure

xy plot of GATA3 peak with motif2 relative to the DHS regions: 3mer structure

Figure 3: xy plot of GATA3 peak with motif2 relative to the DHS regions: 3mer structure

xy plot of GATA3 peak with motif4 relative to the DHS regions: 3mer structure

Figure 4: xy plot of GATA3 peak with motif4 relative to the DHS regions: 3mer structure

xy plot of GATA3 peak with motif5 relative to the DHS regions: 3mer structure

Figure 5: xy plot of GATA3 peak with motif5 relative to the DHS regions: 3mer structure

xy plot of GATA3 peak with motif6 relative to the DHS regions: 3mer structure

Figure 6: xy plot of GATA3 peak with motif6 relative to the DHS regions: 3mer structure

5 Automate analysis to go through the prioritized 3mers

In our previous analysis, we placed emphasis on certain 3-mers by assessing their enrichment in GATA3 peaks relative to DHS regions. This enrichment was determined by calculating the differences in cumulative distribution function (CDF) fractions at a specified “closed” distance (both 16bp and 20bp yield the same 3-mer cluster).

These prioritized 3-mers include “AAA” “TAA” “ATA” “TTA” “AAT” “TAT” “GAT” “ATT” “TTT” “ATC”.

Here, I want to:

  1. Extract the patterns from the RSAT analysis that relate to these 3-mer combinations and compare them with the above analysis.

  2. Automate the customized analysis to go through the prioritized 3-mers and measure the distance from the 2nd 3-mer to the anchored 3-mer.

First, I will use the GATA3 peaks with motif1 for a test run, then apply the analysis to all 5 positive peak sets. Finally, we want to apply it to the peaks without motifs.

5.1 RSAT-dyad analysis – prioritized 3mer

5.1.1 Enrichment xy plot

Load the package, and the function to process the RSAT results file:

library(Biostrings)
# Read an RSAT dyad-analysis result table and keep the dyads made of two
# given patterns.
#
# file_path: tab-separated RSAT result file without a header line. Column 1
#            holds the dyad written as "<first>n{<spacing>}<second>", column 8
#            is taken as the obs/exp ratio.
# pattern1:  pattern that must occur in the first half of the dyad
#            (character string or DNAString).
# pattern2:  pattern that must occur in the second half of the dyad
#            (character string or DNAString).
# number:    offset added to the dyad spacing to obtain relative_distance.
#
# When pattern1 and pattern2 are identical, the dyads made of their reverse
# complements are returned as well; otherwise only the dyads that match the
# patterns as given are kept. Matching ignores case.
#
# Returns a data.frame with the columns first, dyad_distance, second, ratio
# and relative_distance, sorted by decreasing ratio.
process_and_subset_RSAT <- function(file_path, pattern1, pattern2, number) {
  # Read the file
  input_data <- read.table(file_path, header = FALSE, sep = "\t")

  # Split "<first>n{<spacing>}<second>" into its three pieces
  components_list <- strsplit(as.character(input_data$V1), "n\\{|\\}")
  dyad_piece <- function(i) {
    # a missing piece yields NA; vapply() guarantees a character vector
    vapply(components_list, function(x) x[i], character(1))
  }

  processed_data <- data.frame(
    first = dyad_piece(1),
    dyad_distance = as.numeric(dyad_piece(2)),
    second = dyad_piece(3),
    ratio = input_data$V8 # ratio
  )

  # Rows whose first half contains `first_pattern` and whose second half
  # contains `second_pattern`
  matches_dyad <- function(first_pattern, second_pattern) {
    grepl(as.character(first_pattern), processed_data$first,
          ignore.case = TRUE) &
      grepl(as.character(second_pattern), processed_data$second,
            ignore.case = TRUE)
  }

  # DNAString() validates the letters and accepts both input types
  pattern1 <- DNAString(pattern1)
  pattern2 <- DNAString(pattern2)

  keep <- matches_dyad(pattern1, pattern2)
  if (pattern1 == pattern2) {
    keep_rc <- matches_dyad(reverseComplement(pattern1),
                            reverseComplement(pattern2))
    # `& !keep`: a row matching both orientations (possible when a pattern is
    # its own reverse complement) must not be listed twice
    output_data <- rbind(processed_data[keep, ],
                         processed_data[keep_rc & !keep, ])
  } else {
    output_data <- processed_data[keep, ]
  }

  # Order by descending ratio
  output_data <- output_data[order(-output_data$ratio), ]

  # Add the relative distance column
  output_data$relative_distance <- output_data$dyad_distance + number

  output_data
}

In the provided function, pattern1 represents the anchored dyad, pattern2 denotes the other half dyad, and number signifies the user-defined distance (representing the relative separation between two zinc fingers, with “G” serving as the single resolution) when there is no spacing between each dyad.

For AAT and ATT, there are two possible positions of the relative “G/C”: one is (gat)AAT/ATT(atc), the other is AAT(c)/(g)ATT.
I have examined the Information Content (IC) in all sequence logos of the Information Content Matrix, and examined the letter-probability matrix, to determine which configuration is more common/more likely to be observed.
Overall, AAT(c)/(g)ATT have higher IC compared to (gat)AAT/ATT(atc).
I will use AAT(c)/(g)ATT to anchor at the ‘G/C’ base for measuring the relative distance between the two zinc fingers.

I’ve created a file that simplifies the computation of the ‘number’ for any given pair of dyads. It operates by adding the corresponding numbers associated with the anchor (pattern1) or query (pattern2) pattern.

# pattern_anchor_at_G_compute_dis.csv: one row per 3-mer with columns
# pattern, identifier, anchor and query (printed below for reference).
read.csv('pattern_anchor_at_G_compute_dis.csv')
##    pattern identifier anchor query
## 1      GAT        GAT      3     0
## 2      ATC        ATC      1     2
## 3      ATA     (g)ATA      4    -1
## 4      TAT     TAT(c)      0     3
## 5      TTA    TTA(tc)     -1     4
## 6      TAA    (ga)TAA      5    -2
## 7      AAT     AAT(c)      0     3
## 8      ATT     (g)ATT      4    -1
## 9      AAA   (gat)AAA      6    -3
## 10     TTT   TTT(atc)     -2     5
## 11     AGA     AGA(t)      2     1
## 12     TCT     (a)TCT      2     1
## 13     TAG    (ga)TAG      5    -2
## 14     CTA    CTA(tc)     -1     4
library(lattice)
library(latticeExtra)
library(Biostrings)

my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
Anchor_dyad <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC")
Query_dyad <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC")

# Offset table: the `anchor` and `query` columns hold, per 3-mer, the values
# that are summed to obtain the `number` argument of process_and_subset_RSAT().
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')

# Attach the legend label to one strand's result.
# An empty result is replaced by a single all-NA row. The placeholder reuses
# whatever columns process_and_subset_RSAT() returned instead of a hard-coded
# list of five names, so it cannot fall out of sync with that function.
label_strand <- function(hits, label) {
  if (nrow(hits) == 0) {
    hits <- hits[NA_integer_, , drop = FALSE]
    rownames(hits) <- NULL
  }
  hits$query_status <- label
  hits
}

# One PDF per (motif, anchor 3-mer, query 3-mer) combination.
for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")
  for (pattern1 in Anchor_dyad) {
    print(pattern1)
    # The anchor offset depends only on pattern1: look it up once per anchor.
    anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
    for (pattern2 in Query_dyad) {
      print(pattern2)
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)

      # number1: offset when the query is searched as written (same strand).
      # number2: offset when its reverse complement is searched (opposite strand).
      number1 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      number2 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      ss <- process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1)
      os <- process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2)

      same_label <- paste0("same_strand_", pattern2)
      opposite_label <- paste0("opposite_strand_", pattern2)
      ss <- label_strand(ss, same_label)
      os <- label_strand(os, opposite_label)

      df.plot <- rbind(ss, os)
      # Fixed level order keeps same strand = orange, opposite strand = darkgreen.
      df.plot$query_status <- factor(df.plot$query_status, levels = c(same_label, opposite_label))

      # xy plot
      pdf(paste0('test_xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.pdf'), width = 10, height = 6)
      # `finally` closes the device even if plotting fails, so an error in one
      # combination does not leave a dangling graphics device behind.
      tryCatch(
        print(xyplot(ratio ~ relative_distance,
                     data = df.plot,
                     groups = query_status,
                     auto.key = list(space = "right", points = TRUE),
                     aspect = 1,
                     xlim = c(0, 30),
                     ylim = c(0, 500),
                     xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
                     ylab = "RSAT obs/exp Ratio",
                     main = paste0("GATA3 peak with ", motif),
                     between = list(y = 1.0),
                     scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
                     par.settings = list(superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2), strip.background = list(col = "grey85")),
                     panel = function(x, y, ...) {
                       # Light vertical guides at 1..10 bp, drawn under the points.
                       panel.abline(v = c(seq(1, 10, 1)), col = "grey90")
                       panel.xyplot(x, y,
                                    col = c("orange", "darkgreen"),
                                    pch = 18,
                                    cex = 0.6, ...)
                       #panel.barchart(x,y, horizontal = FALSE, col=c("red", "blue"), alpha=0.4)
                     })),
        finally = dev.off()
      )
    }
  }
}

test:

library(lattice)
library(latticeExtra)
library(Biostrings)
my_motifs <- c("motif_1")
Anchor_dyad <- c("GAT")
Query_dyad <- c("GAT")
# Offset table: the `anchor` and `query` columns hold, per 3-mer, the values
# that are summed to obtain the `number` argument of process_and_subset_RSAT().
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')

# Attach the legend label to one strand's result.
# An empty result is replaced by a single all-NA row. The placeholder reuses
# whatever columns process_and_subset_RSAT() returned instead of a hard-coded
# list of five names, so it cannot fall out of sync with that function.
label_strand <- function(hits, label) {
  if (nrow(hits) == 0) {
    hits <- hits[NA_integer_, , drop = FALSE]
    rownames(hits) <- NULL
  }
  hits$query_status <- label
  hits
}

# One PDF per (motif, anchor 3-mer, query 3-mer) combination.
for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")
  for (pattern1 in Anchor_dyad) {
    print(pattern1)
    # The anchor offset depends only on pattern1: look it up once per anchor.
    anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
    for (pattern2 in Query_dyad) {
      print(pattern2)
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)

      # number1: offset when the query is searched as written (same strand).
      # number2: offset when its reverse complement is searched (opposite strand).
      number1 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      number2 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      ss <- process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1)
      os <- process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2)

      same_label <- paste0("same_strand_", pattern2)
      opposite_label <- paste0("opposite_strand_", pattern2)
      ss <- label_strand(ss, same_label)
      os <- label_strand(os, opposite_label)

      df.plot <- rbind(ss, os)
      # Fixed level order keeps same strand = orange, opposite strand = darkgreen.
      df.plot$query_status <- factor(df.plot$query_status, levels = c(same_label, opposite_label))

      # xy plot
      pdf(paste0('test_xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.pdf'), width = 10, height = 6)
      # `finally` closes the device even if plotting fails, so an error in one
      # combination does not leave a dangling graphics device behind.
      tryCatch(
        print(xyplot(ratio ~ relative_distance,
                     data = df.plot,
                     groups = query_status,
                     auto.key = list(space = "right", points = TRUE),
                     aspect = 1,
                     xlim = c(0, 30),
                     ylim = c(0, 500),
                     xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
                     ylab = "RSAT obs/exp Ratio",
                     main = paste0("GATA3 peak with ", motif),
                     between = list(y = 1.0),
                     scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
                     par.settings = list(superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2), strip.background = list(col = "grey85")),
                     panel = function(x, y, ...) {
                       # Light vertical guides at 1..10 bp, drawn under the points.
                       panel.abline(v = c(seq(1, 10, 1)), col = "grey90")
                       panel.xyplot(x, y,
                                    col = c("orange", "darkgreen"),
                                    pch = 18,
                                    cex = 0.6, ...)
                       #panel.barchart(x,y, horizontal = FALSE, col=c("red", "blue"), alpha=0.4)
                     })),
        finally = dev.off()
      )
    }
  }
}

test2:

library(lattice)
library(latticeExtra)
library(Biostrings)
my_motifs <- c("motif_1")
Anchor_dyad <- c("TAA")
Query_dyad <- c("AAA")
# Offset table: the `anchor` and `query` columns hold, per 3-mer, the values
# that are summed to obtain the `number` argument of process_and_subset_RSAT().
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')

# Attach the legend label to one strand's result.
# An empty result is replaced by a single all-NA row. The placeholder reuses
# whatever columns process_and_subset_RSAT() returned instead of a hard-coded
# list of five names, so it cannot fall out of sync with that function.
label_strand <- function(hits, label) {
  if (nrow(hits) == 0) {
    hits <- hits[NA_integer_, , drop = FALSE]
    rownames(hits) <- NULL
  }
  hits$query_status <- label
  hits
}

# One PDF per (motif, anchor 3-mer, query 3-mer) combination.
for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")
  for (pattern1 in Anchor_dyad) {
    print(pattern1)
    # The anchor offset depends only on pattern1: look it up once per anchor.
    anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
    for (pattern2 in Query_dyad) {
      print(pattern2)
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)

      # number1: offset when the query is searched as written (same strand).
      # number2: offset when its reverse complement is searched (opposite strand).
      number1 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      number2 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      ss <- process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1)
      os <- process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2)

      same_label <- paste0("same_strand_", pattern2)
      opposite_label <- paste0("opposite_strand_", pattern2)
      ss <- label_strand(ss, same_label)
      os <- label_strand(os, opposite_label)

      df.plot <- rbind(ss, os)
      # Fixed level order keeps same strand = orange, opposite strand = darkgreen.
      df.plot$query_status <- factor(df.plot$query_status, levels = c(same_label, opposite_label))

      # xy plot
      pdf(paste0('test_xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.pdf'), width = 10, height = 6)
      # `finally` closes the device even if plotting fails, so an error in one
      # combination does not leave a dangling graphics device behind.
      tryCatch(
        print(xyplot(ratio ~ relative_distance,
                     data = df.plot,
                     groups = query_status,
                     auto.key = list(space = "right", points = TRUE),
                     aspect = 1,
                     xlim = c(0, 30),
                     ylim = c(0, 500),
                     xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
                     ylab = "RSAT obs/exp Ratio",
                     main = paste0("GATA3 peak with ", motif),
                     between = list(y = 1.0),
                     scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
                     par.settings = list(superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2), strip.background = list(col = "grey85")),
                     panel = function(x, y, ...) {
                       # Light vertical guides at 1..10 bp, drawn under the points.
                       panel.abline(v = c(seq(1, 10, 1)), col = "grey90")
                       panel.xyplot(x, y,
                                    col = c("orange", "darkgreen"),
                                    pch = 18,
                                    cex = 0.6, ...)
                       #panel.barchart(x,y, horizontal = FALSE, col=c("red", "blue"), alpha=0.4)
                     })),
        finally = dev.off()
      )
    }
  }
}

Anchor at each of the top 10 prioritized 3-mers in turn, then query all 10 3-mers on either the same strand or the opposite strand. This generates 10 x 10 = 100 enrichment graphs for each of the five peak data sets, for a total of 500 enrichment graphs.

5.1.2 Ranking dyad-structure

Another method to identify the most enriched structure within each peak dataset is by ranking the files (from the 5 peak sets) based on the observed-to-expected (obs/exp) ratio. This allows us to determine which structural motif is enriched, followed by a search for the corresponding enrichment graph.

Files generated:
GATA3_peak_161win_with_motif_1_RSAT_dyad.txt
GATA3_peak_161win_with_motif_2_RSAT_dyad.txt
GATA3_peak_161win_with_motif_4_RSAT_dyad.txt
GATA3_peak_161win_with_motif_5_RSAT_dyad.txt
GATA3_peak_161win_with_motif_6_RSAT_dyad.txt

GATA3 peak with motif1

# Top 3 rows when ordered by column 8, numerically, in descending order.
sort -k8,8nr GATA3_peak_161win_with_motif_1_RSAT_dyad.txt | head -n 3
## agan{0}taa   agan{0}taa|ttan{0}tct   0.0000096090146    5168   11.96 10  5178     432.28
## gatn{3}atc   gatn{3}atc|gatn{3}atc   0.0000716460535   12482   85.98 36  12518    145.17
## atan{2}atc   atan{2}atc|gatn{2}tat   0.0001616794646   10453  196.05 1460    11913     53.32

GATA3 peak with motif2

# Top 3 rows when ordered by column 8, numerically, in descending order.
sort -k8,8nr GATA3_peak_161win_with_motif_2_RSAT_dyad.txt | head -n 3
## agan{0}taa   agan{0}taa|ttan{0}tct   0.0000096090146    5367   11.05 15  5382     485.84
## gatn{4}atc   gatn{4}atc|gatn{4}atc   0.0000504302144   10109   55.31 57  10166    182.76
## atan{3}atc   atan{3}atc|gatn{3}tat   0.0001614461361    8061  178.78 812 8873      45.09

GATA3 peak with motif4

# Top 8 rows when ordered by column 8, numerically, in descending order.
sort -k8,8nr GATA3_peak_161win_with_motif_4_RSAT_dyad.txt | head -n 8
## agan{0}taa   agan{0}taa|ttan{0}tct   0.0000096090146    2635    6.31 0   2635     417.35
## atcn{5}atc   atcn{5}atc|gatn{5}gat   0.0001401253129    8136   86.45 3041    11177     94.11
## tagn{0}ata   tagn{0}ata|tatn{0}cta   0.0001101653224    4974   72.39 2520    7494      68.72
## gatn{6}ata   gatn{6}ata|tatn{6}atc   0.0001922430633    5856  117.34 2756    8612      49.91
## atan{0}gat   atan{0}gat|atcn{0}tat   0.0001759397041    5311  115.60 2653    7964      45.94
## atan{4}gat   atan{4}gat|atcn{4}tat   0.0001505921639    4145   93.97 2862    7007      44.11
## gatn{2}ata   gatn{2}ata|tatn{2}atc   0.0001517807219    4287   97.37 2333    6620      44.03
## atcn{1}atc   atcn{1}atc|gatn{1}gat   0.0001617727236    4361  105.00 2663    7024      41.53

GATA3 peak with motif5

# Top 10 rows when ordered by column 8, numerically, in descending order.
sort -k8,8nr GATA3_peak_161win_with_motif_5_RSAT_dyad.txt | head -n 10
## agan{0}taa   agan{0}taa|ttan{0}tct   0.0000096090146    1460    4.15 0   1460     351.92
## atcn{1}gat   atcn{1}gat|atcn{1}gat   0.0000365557250    4232   15.61 14  4246     271.11
## atcn{2}ata   atcn{2}ata|tatn{2}gat   0.0001323956841    3042   55.76 429 3471      54.55
## gatn{0}aac   gatn{0}aac|gttn{0}atc   0.0000457443231     612   19.75 0   612   30.99
## ctgn{0}ata   ctgn{0}ata|tatn{0}cag   0.0001502254396    1839   64.86 5   1844      28.35
## atcn{0}tga   atcn{0}tga|tcan{0}gat   0.0004057440254    4520  175.18 50  4570      25.80
## gatn{0}aaa   gatn{0}aaa|tttn{0}atc   0.0001042104402    1134   44.99 0   1134      25.20
## atcn{0}aga   atcn{0}aga|tctn{0}gat   0.0004149470253    4458  179.15 40  4498      24.88
## atcn{3}taa   atcn{3}taa|ttan{3}gat   0.0001898551128    1853   79.23 91  1944      23.39
## aatn{2}gat   aatn{2}gat|atcn{2}att   0.0002558550032    2514  107.76 354 2868      23.33
# For each gat/atc half-site pairing, list the first 10 rows whose first
# tab-separated field starts with the first 3-mer and later contains the second.
rsat_file=GATA3_peak_161win_with_motif_5_RSAT_dyad.txt
awk -F'\t' '$1 ~ /^gat.*gat/' "$rsat_file" | head -n 10
awk -F'\t' '$1 ~ /^atc.*atc/' "$rsat_file" | head -n 10
echo ""
awk -F'\t' '$1 ~ /^gat.*atc/' "$rsat_file" | head -n 10

echo ""
awk -F'\t' '$1 ~ /^atc.*gat/' "$rsat_file" | head -n 10
## atcn{2}atc   atcn{2}atc|gatn{2}gat   0.0001743278582    1580   73.43 190 1770      21.52
## atcn{9}atc   atcn{9}atc|gatn{9}gat   0.0002062495205     684   80.71 39  723    8.47
## atcn{6}atc   atcn{6}atc|gatn{6}gat   0.0001786128314     568   72.21 29  597    7.87
## atcn{10}atc  atcn{10}atc|gatn{10}gat 0.0002081405868     567   80.51 34  601    7.04
## atcn{4}atc   atcn{4}atc|gatn{4}gat   0.0001842588721     523   76.09 37  560    6.87
## atcn{7}atc   atcn{7}atc|gatn{7}gat   0.0001966917141     445   78.65 32  477    5.66
## atcn{11}atc  atcn{11}atc|gatn{11}gat 0.0001856378756     413   71.03 26  439    5.81
## atcn{12}atc  atcn{12}atc|gatn{12}gat 0.0001884293340     393   71.25 26  419    5.52
## atcn{15}atc  atcn{15}atc|gatn{15}gat 0.0002001648218     389   73.16 18  407    5.32
## atcn{14}atc  atcn{14}atc|gatn{14}gat 0.0001838280257     375   67.98 56  431    5.52
## 
## gatn{5}atc   gatn{5}atc|gatn{5}atc   0.0000883000715     329   36.10 1   330    9.11
## gatn{3}atc   gatn{3}atc|gatn{3}atc   0.0000716460535     239   29.90 6   245    7.99
## gatn{0}atc   gatn{0}atc|gatn{0}atc   0.0000515638671     228   22.26 0   228   10.24
## gatn{8}atc   gatn{8}atc|gatn{8}atc   0.0000899257860     216   35.58 2   218    6.07
## gatn{11}atc  gatn{11}atc|gatn{11}atc 0.0000932630475     214   35.68 1   215    6.00
## gatn{6}atc   gatn{6}atc|gatn{6}atc   0.0000971154028     213   39.26 2   215    5.43
## gatn{10}atc  gatn{10}atc|gatn{10}atc 0.0000987860472     209   38.21 0   209    5.47
## gatn{2}atc   gatn{2}atc|gatn{2}atc   0.0000878513418     208   37.00 0   208    5.62
## gatn{7}atc   gatn{7}atc|gatn{7}atc   0.0000847463572     192   33.89 0   192    5.67
## gatn{12}atc  gatn{12}atc|gatn{12}atc 0.0000888395037     186   33.59 3   189    5.54
## 
## atcn{1}gat   atcn{1}gat|atcn{1}gat   0.0000365557250    4232   15.61 14  4246     271.11
## atcn{8}gat   atcn{8}gat|atcn{8}gat   0.0000890597270     315   35.23 5   320    8.94
## atcn{10}gat  atcn{10}gat|atcn{10}gat 0.0000804379701     250   31.11 3   253    8.03
## atcn{6}gat   atcn{6}gat|atcn{6}gat   0.0000802195944     243   32.43 3   246    7.49
## atcn{14}gat  atcn{14}gat|atcn{14}gat 0.0000920659368     237   34.05 3   240    6.96
## atcn{13}gat  atcn{13}gat|atcn{13}gat 0.0000915682647     232   34.24 2   234    6.77
## atcn{7}gat   atcn{7}gat|atcn{7}gat   0.0000795928625     222   31.83 4   226    6.98
## atcn{11}gat  atcn{11}gat|atcn{11}gat 0.0000917826817     219   35.12 4   223    6.24
## atcn{18}gat  atcn{18}gat|atcn{18}gat 0.0000928758991     201   32.74 1   202    6.14
## atcn{15}gat  atcn{15}gat|atcn{15}gat 0.0000847558549     185   30.98 1   186    5.97

GATA3 peak with motif6

# Top 3 rows when ordered by column 8, numerically, in descending order.
sort -k8,8nr GATA3_peak_161win_with_motif_6_RSAT_dyad.txt | head -n 3
## agan{0}taa   agan{0}taa|ttan{0}tct   0.0000096090146    2445    5.38 2   2447     454.40
## atcn{6}atc   atcn{6}atc|gatn{6}gat   0.0001786128314    3871   93.56 216 4087      41.37
## gatn{7}ata   gatn{7}ata|tatn{7}atc   0.0001561795197    3011   80.92 119 3130      37.21

In the above 5 GATA3 peak data sets, it seems that AGA-TAA, GAT-ATC, ATA-ATC, ATC-ATC, TAG-ATA, ATC-GAT, ATC-ATA and GAT-ATA are enriched in GATA3 peaks that contain GATA3 motifs.

5.1.2.1 AGA-TAA

Unfortunately, AGA is not included in our prioritized list, but it is ranked just after TTT/AAA. Therefore, we can include an analysis of this 3-mer here.
And in our customized analysis, we might also include AGA/TCT to our prioritized 3mer list.

library(lattice)
library(latticeExtra)
library(Biostrings)
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
Anchor_dyad <- c("AGA")
Query_dyad <- c("TAA")
# Offset table: the `anchor` and `query` columns hold, per 3-mer, the values
# that are summed to obtain the `number` argument of process_and_subset_RSAT().
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')

# Attach the legend label to one strand's result.
# An empty result is replaced by a single all-NA row. The placeholder reuses
# whatever columns process_and_subset_RSAT() returned instead of a hard-coded
# list of five names, so it cannot fall out of sync with that function.
label_strand <- function(hits, label) {
  if (nrow(hits) == 0) {
    hits <- hits[NA_integer_, , drop = FALSE]
    rownames(hits) <- NULL
  }
  hits$query_status <- label
  hits
}

# One PDF per (motif, anchor 3-mer, query 3-mer) combination.
for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")
  for (pattern1 in Anchor_dyad) {
    print(pattern1)
    # The anchor offset depends only on pattern1: look it up once per anchor.
    anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
    for (pattern2 in Query_dyad) {
      print(pattern2)
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)

      # number1: offset when the query is searched as written (same strand).
      # number2: offset when its reverse complement is searched (opposite strand).
      number1 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      number2 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      ss <- process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1)
      os <- process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2)

      same_label <- paste0("same_strand_", pattern2)
      opposite_label <- paste0("opposite_strand_", pattern2)
      ss <- label_strand(ss, same_label)
      os <- label_strand(os, opposite_label)

      df.plot <- rbind(ss, os)
      # Fixed level order keeps same strand = orange, opposite strand = darkgreen.
      df.plot$query_status <- factor(df.plot$query_status, levels = c(same_label, opposite_label))

      # xy plot
      pdf(paste0('xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.pdf'), width = 10, height = 6)
      # `finally` closes the device even if plotting fails, so an error in one
      # combination does not leave a dangling graphics device behind.
      tryCatch(
        print(xyplot(ratio ~ relative_distance,
                     data = df.plot,
                     groups = query_status,
                     auto.key = list(space = "right", points = TRUE),
                     aspect = 1,
                     xlim = c(-1, 30),
                     ylim = c(0, 500),
                     xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
                     ylab = "RSAT obs/exp Ratio",
                     main = paste0("GATA3 peak with ", motif),
                     between = list(y = 1.0),
                     scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
                     par.settings = list(superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2), strip.background = list(col = "grey85")),
                     panel = function(x, y, ...) {
                       # Light vertical guides at 1..10 bp, drawn under the points.
                       panel.abline(v = c(seq(1, 10, 1)), col = "grey90")
                       panel.xyplot(x, y,
                                    col = c("orange", "darkgreen"),
                                    pch = 18,
                                    cex = 0.6, ...)
                       #panel.barchart(x,y, horizontal = FALSE, col=c("red", "blue"), alpha=0.4)
                     })),
        finally = dev.off()
      )
    }
  }
}
## [1] "motif_1"
## [1] "AGA"
## [1] "TAA"
## 3-letter DNAString object
## seq: TTA
## [1] "motif_4"
## [1] "AGA"
## [1] "TAA"
## 3-letter DNAString object
## seq: TTA
## [1] "motif_2"
## [1] "AGA"
## [1] "TAA"
## 3-letter DNAString object
## seq: TTA
## [1] "motif_6"
## [1] "AGA"
## [1] "TAA"
## 3-letter DNAString object
## seq: TTA
## [1] "motif_5"
## [1] "AGA"
## [1] "TAA"
## 3-letter DNAString object
## seq: TTA
xy plot of rsat analysis for GATA3 peak with motif4: 3mer structure

Figure 7: xy plot of rsat analysis for GATA3 peak with motif4: 3mer structure

All enrichment graphs show enrichment of a dyad structure in which AGA lies 6 bp from TAA on the opposite strand, i.e. [AGA][TTA]TC, with a relative distance of 6 bp from G to C.

We did not observe enrichment at a relative distance of 0 between AGA and TAA on the same strand. This is because a relative distance of 0 between two zinc fingers implies that only one zinc finger is binding in that location, indicating the absence of a dyad structure. Similarly, when we did not observe enrichment for the AGATAA structure in our graph, it’s because AGATAA is considered a single binding site, rather than two separate motifs.

5.1.2.2 GAT-ATC

library(lattice)
library(latticeExtra)
library(Biostrings)
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
Anchor_dyad <- c("GAT")
Query_dyad <- c("ATC")
# Offset table: the `anchor` and `query` columns hold, per 3-mer, the values
# that are summed to obtain the `number` argument of process_and_subset_RSAT().
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')

# Attach the legend label to one strand's result.
# An empty result is replaced by a single all-NA row. The placeholder reuses
# whatever columns process_and_subset_RSAT() returned instead of a hard-coded
# list of five names, so it cannot fall out of sync with that function.
label_strand <- function(hits, label) {
  if (nrow(hits) == 0) {
    hits <- hits[NA_integer_, , drop = FALSE]
    rownames(hits) <- NULL
  }
  hits$query_status <- label
  hits
}

# One PNG per (motif, anchor 3-mer, query 3-mer) combination.
for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")
  for (pattern1 in Anchor_dyad) {
    print(pattern1)
    # The anchor offset depends only on pattern1: look it up once per anchor.
    anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
    for (pattern2 in Query_dyad) {
      print(pattern2)
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)

      # number1: offset when the query is searched as written (same strand).
      # number2: offset when its reverse complement is searched (opposite strand).
      number1 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      number2 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      ss <- process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1)
      os <- process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2)

      same_label <- paste0("same_strand_", pattern2)
      opposite_label <- paste0("opposite_strand_", pattern2)
      ss <- label_strand(ss, same_label)
      os <- label_strand(os, opposite_label)

      df.plot <- rbind(ss, os)
      # Fixed level order keeps same strand = orange, opposite strand = darkgreen.
      df.plot$query_status <- factor(df.plot$query_status, levels = c(same_label, opposite_label))

      # xy plot
      png(paste0('xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.png'))
      # `finally` closes the device even if plotting fails, so an error in one
      # combination does not leave a dangling graphics device behind.
      tryCatch(
        print(xyplot(ratio ~ relative_distance,
                     data = df.plot,
                     groups = query_status,
                     auto.key = list(space = "right", points = TRUE),
                     aspect = 1,
                     xlim = c(-1, 30),
                     ylim = c(0, 500),
                     xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
                     ylab = "RSAT obs/exp Ratio",
                     main = paste0("GATA3 peak with ", motif),
                     between = list(y = 1.0),
                     scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
                     par.settings = list(superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2), strip.background = list(col = "grey85")),
                     panel = function(x, y, ...) {
                       # Light vertical guides at 1..10 bp, drawn under the points.
                       panel.abline(v = c(seq(1, 10, 1)), col = "grey90")
                       panel.xyplot(x, y,
                                    col = c("orange", "darkgreen"),
                                    pch = 18,
                                    cex = 0.6, ...)
                       #panel.barchart(x,y, horizontal = FALSE, col=c("red", "blue"), alpha=0.4)
                     })),
        finally = dev.off()
      )
    }
  }
}
## [1] "motif_1"
## [1] "GAT"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_4"
## [1] "GAT"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_2"
## [1] "GAT"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_6"
## [1] "GAT"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_5"
## [1] "GAT"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
GATA3 peak with motif1
xy plot of rsat analysis for GATA3 peak with motif1: 3mer structure

Figure 8: xy plot of rsat analysis for GATA3 peak with motif1: 3mer structure

For GATA3 peaks with motif1, we see an enriched dyad structure of a GAT at a relative distance of 8 bp from an ATC on the same strand.
This is the same as the RSAT-defined GATn{3}ATC, which has the second-highest ratio, and it also matches the motif1 PWM/sequence logo.

GATA3 peak with motif2

xy plot of rsat analysis for GATA3 peak with motif2: 3mer structure

Figure 9: xy plot of rsat analysis for GATA3 peak with motif2: 3mer structure

For GATA3 peaks with motif2, we see an enriched dyad structure of a GAT at a relative distance of 9 bp from an ATC on the same strand.
This is the same as the RSAT-defined GATn{4}ATC, which has the second-highest ratio, and it also matches the motif2 PWM/sequence logo.

GATA3 peak with motif4
xy plot of rsat analysis for GATA3 peak with motif4: 3mer structure

Figure 10: xy plot of rsat analysis for GATA3 peak with motif4: 3mer structure

For GATA3 peaks with motif4, we have seen an enriched dyad structure of a GAT with 8bp relative distance to its opposite strand ATC (same as the same strand GAT).
This is same as the RSAT defined atcn{5}atc/gatn{5}gat which rank as second highest ratio. And also match the motif4 PWMs/seqlogo.

GATA3 peak with motif6
xy plot of rsat analysis for GATA3 peak with motif6: 3mer structure

Figure 11: xy plot of rsat analysis for GATA3 peak with motif6: 3mer structure

For GATA3 peaks with motif6, we see an enriched dyad structure of a GAT at a relative distance of 9 bp from an ATC on the opposite strand.
This is the same as the RSAT-defined atcn{6}atc|gatn{6}gat, which has the second-highest ratio, and it also matches the motif6 PWM/sequence logo.

GATA3 peak with motif5
xy plot of rsat analysis for GATA3 peak with motif5: 3mer structure

Figure 12: xy plot of rsat analysis for GATA3 peak with motif5: 3mer structure

For GATA3 peaks with motif5, we did not see a strongly enriched dyad structure for GAT relative to +/- ATC. The relative distance of 5 bp may or may not be enriched (after all, the ratio seems to be below 25). This is expected, because motif5 has a specific structure of ATCn{1}GAT, which has the second-highest ratio in the RSAT results. This structure is different from GAT relative to +/- ATC.

5.1.2.3 ATC-GAT

library(lattice)
library(latticeExtra)
library(Biostrings)

# For every GATA3 peak set, plot the RSAT obs/exp ratio against the relative
# distance for one anchor/query 3-mer pair, overlaying the query itself
# ("same strand") and its reverse complement ("opposite strand").
# process_and_subset_RSAT() is defined elsewhere in this document.
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
Anchor_dyad <- c("ATC")
Query_dyad <- c("GAT")
# Lookup table keyed by 3-mer ("pattern") with "anchor" and "query" offsets;
# their sum is the number handed to process_and_subset_RSAT().
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')

# Tag an RSAT subset with its strand label. An empty subset is replaced by a
# single all-NA row so the strand still contributes a row (and a legend level)
# to the plotted data.
label_query_status <- function(hits, status) {
  if (nrow(hits) == 0) {
    hits <- data.frame(matrix(NA, ncol = ncol(hits), nrow = 1))
    colnames(hits) <- c("first", "dyad_distance", "second", "ratio", "relative_distance")
  }
  hits$query_status <- status
  hits
}

for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")
  for (pattern1 in Anchor_dyad) {
    print(pattern1)
    # The anchor offset does not depend on the query, so look it up once.
    anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
    for (pattern2 in Query_dyad) {
      print(pattern2)
      number1 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)
      number2 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      same_label <- paste0("same_strand_", pattern2)
      opposite_label <- paste0("opposite_strand_", pattern2)
      ss <- label_query_status(process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1), same_label)
      os <- label_query_status(process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2), opposite_label)

      df.plot <- rbind(ss, os)
      df.plot$query_status <- factor(df.plot$query_status, levels = c(same_label, opposite_label))

      # xy plot
      png(paste0('xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.png'))
      # Close the PNG device even if plotting fails, so one bad peak set does
      # not leave a device open for the remaining iterations.
      tryCatch(
        print(xyplot(ratio ~ relative_distance,
                     data = df.plot,
                     groups = query_status,
                     auto.key = list(space = "right", points = TRUE),
                     aspect = 1,
                     xlim = c(-1, 30),
                     ylim = c(0, 500),
                     xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
                     ylab = "RSAT obs/exp Ratio",
                     main = paste0("GATA3 peak with ", motif),
                     between = list(y = 1.0),
                     scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
                     par.settings = list(superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
                                         strip.background = list(col = "grey85")),
                     panel = function(x, y, ...) {
                       # light vertical guides at 1..10 bp
                       panel.abline(v = seq(1, 10, 1), col = "grey90")
                       panel.xyplot(x, y, col = c("orange", "darkgreen"), pch = 18, cex = 0.6, ...)
                     })),
        finally = dev.off()
      )
    }
  }
}
## [1] "motif_1"
## [1] "ATC"
## [1] "GAT"
## 3-letter DNAString object
## seq: ATC
## [1] "motif_4"
## [1] "ATC"
## [1] "GAT"
## 3-letter DNAString object
## seq: ATC
## [1] "motif_2"
## [1] "ATC"
## [1] "GAT"
## 3-letter DNAString object
## seq: ATC
## [1] "motif_6"
## [1] "ATC"
## [1] "GAT"
## 3-letter DNAString object
## seq: ATC
## [1] "motif_5"
## [1] "ATC"
## [1] "GAT"
## 3-letter DNAString object
## seq: ATC
GATA3 peak with motif5
xy plot of rsat analysis for GATA3 peak with motif6: 3mer structure

Figure 13: xy plot of rsat analysis for GATA3 peak with motif6: 3mer structure

As previously mentioned, we now see an enriched dyad structure of an ATC at 2bp relative distance to its opposite strand GAT, for peaks with motif5.

This is the same as the RSAT-defined atcn{1}gat, which ranks as the second highest ratio, and it also matches the motif5 PWM/seqlogo.

5.1.2.4 ATA-ATC

This structure is only enriched in GATA3 peaks with motif1 and motif2 according to RSAT results.

library(lattice)
library(latticeExtra)
library(Biostrings)

# For every GATA3 peak set, plot the RSAT obs/exp ratio against the relative
# distance for one anchor/query 3-mer pair, overlaying the query itself
# ("same strand") and its reverse complement ("opposite strand").
# process_and_subset_RSAT() is defined elsewhere in this document.
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
Anchor_dyad <- c("ATA")
Query_dyad <- c("ATC")
# Lookup table keyed by 3-mer ("pattern") with "anchor" and "query" offsets;
# their sum is the number handed to process_and_subset_RSAT().
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')

# Tag an RSAT subset with its strand label. An empty subset is replaced by a
# single all-NA row so the strand still contributes a row (and a legend level)
# to the plotted data.
label_query_status <- function(hits, status) {
  if (nrow(hits) == 0) {
    hits <- data.frame(matrix(NA, ncol = ncol(hits), nrow = 1))
    colnames(hits) <- c("first", "dyad_distance", "second", "ratio", "relative_distance")
  }
  hits$query_status <- status
  hits
}

for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")
  for (pattern1 in Anchor_dyad) {
    print(pattern1)
    # The anchor offset does not depend on the query, so look it up once.
    anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
    for (pattern2 in Query_dyad) {
      print(pattern2)
      number1 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)
      number2 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      same_label <- paste0("same_strand_", pattern2)
      opposite_label <- paste0("opposite_strand_", pattern2)
      ss <- label_query_status(process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1), same_label)
      os <- label_query_status(process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2), opposite_label)

      df.plot <- rbind(ss, os)
      df.plot$query_status <- factor(df.plot$query_status, levels = c(same_label, opposite_label))

      # xy plot
      png(paste0('xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.png'))
      # Close the PNG device even if plotting fails, so one bad peak set does
      # not leave a device open for the remaining iterations.
      tryCatch(
        print(xyplot(ratio ~ relative_distance,
                     data = df.plot,
                     groups = query_status,
                     auto.key = list(space = "right", points = TRUE),
                     aspect = 1,
                     xlim = c(-1, 30),
                     ylim = c(0, 500),
                     xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
                     ylab = "RSAT obs/exp Ratio",
                     main = paste0("GATA3 peak with ", motif),
                     between = list(y = 1.0),
                     scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
                     par.settings = list(superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
                                         strip.background = list(col = "grey85")),
                     panel = function(x, y, ...) {
                       # light vertical guides at 1..10 bp
                       panel.abline(v = seq(1, 10, 1), col = "grey90")
                       panel.xyplot(x, y, col = c("orange", "darkgreen"), pch = 18, cex = 0.6, ...)
                     })),
        finally = dev.off()
      )
    }
  }
}
## [1] "motif_1"
## [1] "ATA"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_4"
## [1] "ATA"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_2"
## [1] "ATA"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_6"
## [1] "ATA"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_5"
## [1] "ATA"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 14: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 15: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 16: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 17: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 18: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

It is clear in the enrichment graph that GATA3 peaks with motif1 has an enriched dyad structure of ATAn{2}ATC (relative distance of two zinc fingers is 8bp); GATA3 peaks with motif2 has an enriched dyad structure of ATAn{3}ATC (relative distance of two zinc finger is 9bp).
For GATA3 peaks with motif4, ATAn{0}GAT and ATAn{4}GAT are enriched. The relative distances between the two zinc fingers are 4bp and 8bp.
No remarkable enrichment of dyad structure related to ATA and ATC for GATA3 peaks with motif5 and motif6.

5.1.2.5 ATC-ATC

Notice that this is anchoring at ATC and looking for ATC on same strand or opposite strand.

library(lattice)
library(latticeExtra)
library(Biostrings)

# For every GATA3 peak set, plot the RSAT obs/exp ratio against the relative
# distance for one anchor/query 3-mer pair, overlaying the query itself
# ("same strand") and its reverse complement ("opposite strand").
# process_and_subset_RSAT() is defined elsewhere in this document.
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
Anchor_dyad <- c("ATC")
Query_dyad <- c("ATC")
# Lookup table keyed by 3-mer ("pattern") with "anchor" and "query" offsets;
# their sum is the number handed to process_and_subset_RSAT().
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')

# Tag an RSAT subset with its strand label. An empty subset is replaced by a
# single all-NA row so the strand still contributes a row (and a legend level)
# to the plotted data.
label_query_status <- function(hits, status) {
  if (nrow(hits) == 0) {
    hits <- data.frame(matrix(NA, ncol = ncol(hits), nrow = 1))
    colnames(hits) <- c("first", "dyad_distance", "second", "ratio", "relative_distance")
  }
  hits$query_status <- status
  hits
}

for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")
  for (pattern1 in Anchor_dyad) {
    print(pattern1)
    # The anchor offset does not depend on the query, so look it up once.
    anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
    for (pattern2 in Query_dyad) {
      print(pattern2)
      number1 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)
      number2 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      same_label <- paste0("same_strand_", pattern2)
      opposite_label <- paste0("opposite_strand_", pattern2)
      ss <- label_query_status(process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1), same_label)
      os <- label_query_status(process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2), opposite_label)

      df.plot <- rbind(ss, os)
      df.plot$query_status <- factor(df.plot$query_status, levels = c(same_label, opposite_label))

      # xy plot
      png(paste0('xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.png'))
      # Close the PNG device even if plotting fails, so one bad peak set does
      # not leave a device open for the remaining iterations.
      tryCatch(
        print(xyplot(ratio ~ relative_distance,
                     data = df.plot,
                     groups = query_status,
                     auto.key = list(space = "right", points = TRUE),
                     aspect = 1,
                     xlim = c(-1, 30),
                     ylim = c(0, 500),
                     xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
                     ylab = "RSAT obs/exp Ratio",
                     main = paste0("GATA3 peak with ", motif),
                     between = list(y = 1.0),
                     scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
                     par.settings = list(superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
                                         strip.background = list(col = "grey85")),
                     panel = function(x, y, ...) {
                       # light vertical guides at 1..10 bp
                       panel.abline(v = seq(1, 10, 1), col = "grey90")
                       panel.xyplot(x, y, col = c("orange", "darkgreen"), pch = 18, cex = 0.6, ...)
                     })),
        finally = dev.off()
      )
    }
  }
}
## [1] "motif_1"
## [1] "ATC"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_4"
## [1] "ATC"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_2"
## [1] "ATC"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_6"
## [1] "ATC"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_5"
## [1] "ATC"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 19: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 20: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 21: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 22: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 23: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

As expected, GATA3 peak with motif 1 and motif 2 do not show enrichment for dyad structure made of this pair of 3mer.
GATA3 peak with motif4 has enrichment at relative distance 8bp for ATC and same strand ATC;
GATA3 peak with motif6 has enrichment at relative distance 9bp for ATC and same strand ATC;
GATA3 peak with motif5 has enrichment at relative distance 2bp for ATC and opposite strand ATC.

These all match with the defined motif structure for each peak set.

5.1.2.6 TAG-ATA

Unfortunately, TAG is not included in our prioritized list. It is ranked even after AGA/TCT. We can include an analysis of this 3-mer here.
And in our customized analysis, we might also include TAG/CTA to our prioritized 3mer list.

library(lattice)
library(latticeExtra)
library(Biostrings)

# For every GATA3 peak set, plot the RSAT obs/exp ratio against the relative
# distance for one anchor/query 3-mer pair, overlaying the query itself
# ("same strand") and its reverse complement ("opposite strand").
# process_and_subset_RSAT() is defined elsewhere in this document.
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
Anchor_dyad <- c("TAG")
Query_dyad <- c("ATA")
# Lookup table keyed by 3-mer ("pattern") with "anchor" and "query" offsets;
# their sum is the number handed to process_and_subset_RSAT().
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')

# Tag an RSAT subset with its strand label. An empty subset is replaced by a
# single all-NA row so the strand still contributes a row (and a legend level)
# to the plotted data.
label_query_status <- function(hits, status) {
  if (nrow(hits) == 0) {
    hits <- data.frame(matrix(NA, ncol = ncol(hits), nrow = 1))
    colnames(hits) <- c("first", "dyad_distance", "second", "ratio", "relative_distance")
  }
  hits$query_status <- status
  hits
}

for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")
  for (pattern1 in Anchor_dyad) {
    print(pattern1)
    # The anchor offset does not depend on the query, so look it up once.
    anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
    for (pattern2 in Query_dyad) {
      print(pattern2)
      number1 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)
      number2 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      same_label <- paste0("same_strand_", pattern2)
      opposite_label <- paste0("opposite_strand_", pattern2)
      ss <- label_query_status(process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1), same_label)
      os <- label_query_status(process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2), opposite_label)

      df.plot <- rbind(ss, os)
      df.plot$query_status <- factor(df.plot$query_status, levels = c(same_label, opposite_label))

      # xy plot
      png(paste0('xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.png'))
      # Close the PNG device even if plotting fails, so one bad peak set does
      # not leave a device open for the remaining iterations.
      tryCatch(
        print(xyplot(ratio ~ relative_distance,
                     data = df.plot,
                     groups = query_status,
                     auto.key = list(space = "right", points = TRUE),
                     aspect = 1,
                     xlim = c(-1, 30),
                     ylim = c(0, 500),
                     xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
                     ylab = "RSAT obs/exp Ratio",
                     main = paste0("GATA3 peak with ", motif),
                     between = list(y = 1.0),
                     scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
                     par.settings = list(superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
                                         strip.background = list(col = "grey85")),
                     panel = function(x, y, ...) {
                       # light vertical guides at 1..10 bp
                       panel.abline(v = seq(1, 10, 1), col = "grey90")
                       panel.xyplot(x, y, col = c("orange", "darkgreen"), pch = 18, cex = 0.6, ...)
                     })),
        finally = dev.off()
      )
    }
  }
}
## [1] "motif_1"
## [1] "TAG"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_4"
## [1] "TAG"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_2"
## [1] "TAG"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_6"
## [1] "TAG"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_5"
## [1] "TAG"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 24: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 25: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 26: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 27: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 28: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

It is interesting to see that for all peak set, we did not have dyad structure of TAG{}TAT on the same strand.
And for TAG-ATA on the same strand, we only see enrichment for GATA3 peaks with motif4 (RSAT calculates a ratio of 68.72, ranked third, for tagn{0}ata). The relative distance between the two zinc fingers is 4bp. The binding element has to look like this: GATAGATA.

5.1.2.7 ATC-ATA

library(lattice)
library(latticeExtra)
library(Biostrings)

# For every GATA3 peak set, plot the RSAT obs/exp ratio against the relative
# distance for one anchor/query 3-mer pair, overlaying the query itself
# ("same strand") and its reverse complement ("opposite strand").
# process_and_subset_RSAT() is defined elsewhere in this document.
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
Anchor_dyad <- c("ATC")
Query_dyad <- c("ATA")
# Lookup table keyed by 3-mer ("pattern") with "anchor" and "query" offsets;
# their sum is the number handed to process_and_subset_RSAT().
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')

# Tag an RSAT subset with its strand label. An empty subset is replaced by a
# single all-NA row so the strand still contributes a row (and a legend level)
# to the plotted data.
label_query_status <- function(hits, status) {
  if (nrow(hits) == 0) {
    hits <- data.frame(matrix(NA, ncol = ncol(hits), nrow = 1))
    colnames(hits) <- c("first", "dyad_distance", "second", "ratio", "relative_distance")
  }
  hits$query_status <- status
  hits
}

for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")
  for (pattern1 in Anchor_dyad) {
    print(pattern1)
    # The anchor offset does not depend on the query, so look it up once.
    anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
    for (pattern2 in Query_dyad) {
      print(pattern2)
      number1 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)
      number2 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      same_label <- paste0("same_strand_", pattern2)
      opposite_label <- paste0("opposite_strand_", pattern2)
      ss <- label_query_status(process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1), same_label)
      os <- label_query_status(process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2), opposite_label)

      df.plot <- rbind(ss, os)
      df.plot$query_status <- factor(df.plot$query_status, levels = c(same_label, opposite_label))

      # xy plot
      png(paste0('xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.png'))
      # Close the PNG device even if plotting fails, so one bad peak set does
      # not leave a device open for the remaining iterations.
      tryCatch(
        print(xyplot(ratio ~ relative_distance,
                     data = df.plot,
                     groups = query_status,
                     auto.key = list(space = "right", points = TRUE),
                     aspect = 1,
                     xlim = c(-1, 30),
                     ylim = c(0, 500),
                     xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
                     ylab = "RSAT obs/exp Ratio",
                     main = paste0("GATA3 peak with ", motif),
                     between = list(y = 1.0),
                     scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
                     par.settings = list(superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
                                         strip.background = list(col = "grey85")),
                     panel = function(x, y, ...) {
                       # light vertical guides at 1..10 bp
                       panel.abline(v = seq(1, 10, 1), col = "grey90")
                       panel.xyplot(x, y, col = c("orange", "darkgreen"), pch = 18, cex = 0.6, ...)
                     })),
        finally = dev.off()
      )
    }
  }
}
## [1] "motif_1"
## [1] "ATC"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_4"
## [1] "ATC"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_2"
## [1] "ATC"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_6"
## [1] "ATC"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_5"
## [1] "ATC"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 29: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 30: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 31: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 32: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 33: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

In these graphs, we also did not see any structure of ATC-TAT (opposite strand ATA), and we only see one enriched structure for ATC-ATA, at a relative distance of 2bp. This structure must be ATCxGATA, which matches motif5.

5.1.2.8 GAT-ATA

library(lattice)
library(latticeExtra)
library(Biostrings)

# For every GATA3 peak set, plot the RSAT obs/exp ratio against the relative
# distance for one anchor/query 3-mer pair, overlaying the query itself
# ("same strand") and its reverse complement ("opposite strand").
# process_and_subset_RSAT() is defined elsewhere in this document.
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
Anchor_dyad <- c("GAT")
Query_dyad <- c("ATA")
# Lookup table keyed by 3-mer ("pattern") with "anchor" and "query" offsets;
# their sum is the number handed to process_and_subset_RSAT().
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')

# Tag an RSAT subset with its strand label. An empty subset is replaced by a
# single all-NA row so the strand still contributes a row (and a legend level)
# to the plotted data.
label_query_status <- function(hits, status) {
  if (nrow(hits) == 0) {
    hits <- data.frame(matrix(NA, ncol = ncol(hits), nrow = 1))
    colnames(hits) <- c("first", "dyad_distance", "second", "ratio", "relative_distance")
  }
  hits$query_status <- status
  hits
}

for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")
  for (pattern1 in Anchor_dyad) {
    print(pattern1)
    # The anchor offset does not depend on the query, so look it up once.
    anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
    for (pattern2 in Query_dyad) {
      print(pattern2)
      number1 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)
      number2 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      same_label <- paste0("same_strand_", pattern2)
      opposite_label <- paste0("opposite_strand_", pattern2)
      ss <- label_query_status(process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1), same_label)
      os <- label_query_status(process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2), opposite_label)

      df.plot <- rbind(ss, os)
      df.plot$query_status <- factor(df.plot$query_status, levels = c(same_label, opposite_label))

      # xy plot
      png(paste0('xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.png'))
      # Close the PNG device even if plotting fails, so one bad peak set does
      # not leave a device open for the remaining iterations.
      tryCatch(
        print(xyplot(ratio ~ relative_distance,
                     data = df.plot,
                     groups = query_status,
                     auto.key = list(space = "right", points = TRUE),
                     aspect = 1,
                     xlim = c(-1, 30),
                     ylim = c(0, 500),
                     xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
                     ylab = "RSAT obs/exp Ratio",
                     main = paste0("GATA3 peak with ", motif),
                     between = list(y = 1.0),
                     scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
                     par.settings = list(superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
                                         strip.background = list(col = "grey85")),
                     panel = function(x, y, ...) {
                       # light vertical guides at 1..10 bp
                       panel.abline(v = seq(1, 10, 1), col = "grey90")
                       panel.xyplot(x, y, col = c("orange", "darkgreen"), pch = 18, cex = 0.6, ...)
                     })),
        finally = dev.off()
      )
    }
  }
}
## [1] "motif_1"
## [1] "GAT"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_4"
## [1] "GAT"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_2"
## [1] "GAT"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_6"
## [1] "GAT"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_5"
## [1] "GAT"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 34: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 35: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 36: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 37: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Figure 38: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure

Notice that we again did not see dyad structures of GAT-TAT (opposite strand ATA), but we identified enriched GAT-ATA structures for GATA3 peaks with motif4 and motif6.

For GATA3 peaks with motif4, we see gatn{6}ata and gatn{2}ata enriched. The relative distances between the two zinc fingers are 8bp and 4bp. The 8bp one matches motif4; the 4bp one matches GATAGATA.
For GATA3 peaks with motif6, we see gatn{7}ata enriched; the relative distance between the two zinc fingers is 9bp. This matches the motif6 PWM.

For the added 2 pairs of 3mer:

I have added the relative distances to the pattern_anchor_at_G_compute_dis.csv file.

pattern identifier anchor query
AGA AGA(t)  2   1
TCT (a)TCT  2   1
TAG (ga)TAG 5   -2
CTA CTA(tc) -1  4

5.1.3 Summary

Overall, RSAT-dyad analysis proves effective in identifying enriched binding elements within a set of peak regions, with proper controls and parameter settings.

Enriched dyad structures can be discerned through various combinations of 3-mers. Establishing the relative distance between zinc fingers serves as a valuable method to determine if a structure arises from the same binding element.

5.2 Customized analysis – prioritized 3mer

5.2.1 Closest 3mer to peak summits

Goal: given a prioritized 3mer list, we can generate the closest 3mer coordinates to a given sets of peak summits.

240208_closestBed.R:

(cd /home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer)

#!/usr/bin/env Rscript

Args <- commandArgs(TRUE)

# closestBed function
#
# Run bedtools closestBed on two BED-like data frames and return its output.
#
# Arguments:
#   functionstring  command used to invoke closestBed (pasted verbatim into
#                   the shell command, so it may contain several words).
#   bed1, bed2      data frames written out tab-separated, without header, as
#                   the -a and -b inputs; both are coordinate-sorted with
#                   `sort -k1,1 -k2,2n` before the search.
#   opt.string      extra closestBed options, inserted before -a/-b.
#
# Returns a data frame read back from the closestBed output whose column names
# are colnames(bed1), colnames(bed2) and 'dis'; this naming therefore expects
# opt.string to add exactly one trailing column (e.g. '-d').
#
# Stops with an error if any shell step exits with a non-zero status.
# Side effect: options(scipen = 99) stays in effect after the call.
bedTools.closest <- function(functionstring="/home/FCAM/ssun/packages/bedtools2/bin/closestBed",bed1,bed2,opt.string="") {

  # Do not use scientific notation when writing coordinates out. This is left
  # set on return, as before, so later write.table() calls in the calling
  # script keep the same number formatting.
  options(scipen = 99)

  # Uniquely named scratch files in the working directory: unique so that
  # concurrent runs in one directory cannot clobber each other, and in the
  # working directory (not tempdir()) because the inputs can be genome-wide.
  scratch <- function(prefix) {
    tempfile(pattern = prefix, tmpdir = getwd(), fileext = ".bed")
  }
  a.file <- scratch("a.file.")
  b.file <- scratch("b.file.")
  a.sorted <- scratch("a.file.sorted.")
  b.sorted <- scratch("b.file.sorted.")
  out.file <- scratch("out.file.")
  # Remove the intermediates on every exit path, including errors.
  on.exit(unlink(c(a.file, b.file, a.sorted, b.sorted, out.file)), add = TRUE)

  # Echo a shell command, run it, and fail loudly on a non-zero exit status
  # instead of carrying on with a missing or truncated file.
  run.shell <- function(command) {
    cat(command, "\n")
    status <- system(command)
    if (status != 0) {
      stop("command failed with exit status ", status, ": ", command, call. = FALSE)
    }
    invisible(status)
  }

  # write bed formatted data.frames to the scratch files
  write.table(bed1, file = a.file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  write.table(bed2, file = b.file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)

  # sort both inputs by chromosome, then numerically by start coordinate
  run.shell(paste('sort -k1,1 -k2,2n', shQuote(a.file), '>', shQuote(a.sorted)))
  run.shell(paste('sort -k1,1 -k2,2n', shQuote(b.file), '>', shQuote(b.sorted)))

  # call closestBed on the sorted inputs
  run.shell(paste(functionstring, opt.string, "-a", shQuote(a.sorted), "-b", shQuote(b.sorted), ">", shQuote(out.file), sep = " "))

  res <- read.table(out.file, header = FALSE, comment.char = '')
  colnames(res) <- c(colnames(bed1), colnames(bed2), 'dis')
  return(res)
}

# dir1: genome-wide 3-mer coordinate BED files (one per strand and 3-mer)
# dir2: GATA3 peak sets, one BED file per motif
# dir3: DHS control regions
dir1 <- "/labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/"
dir2 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/MAST_positive_control/"
dir3 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/ENCODE_DHS_GSE29692/"
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
prioritized_triplets <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")
library(bigWig)

# Read the genome-wide coordinates of one 3-mer on one strand ("plus" or
# "minus"). The file name carries a variable infix matched by the wildcard,
# so insist on exactly one match rather than passing zero or several paths
# to read.table().
read.triplet.bed <- function(strand, triplet) {
  hits <- Sys.glob(paste0(dir1, "hg38.3.3.3", strand, ".*_", triplet, ".bed"))
  if (length(hits) != 1) {
    stop("expected exactly one ", strand, " BED file for ", triplet, " in ", dir1,
         ", found ", length(hits), call. = FALSE)
  }
  read.table(file = hits, sep = "\t", header = FALSE)
}

# Find the closest 3-mer occurrence to each region centre and write the result
# as a headerless, tab-separated file. '-d' appends the distance column and
# '-t first' keeps only the first hit when several are tied.
write.closest <- function(centers, triplet.bed, out.file) {
  closest <- bedTools.closest(bed1 = centers[, 1:3], bed2 = triplet.bed, opt.string = '-d -t first')
  write.table(closest, file = out.file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
}

# Region sets do not depend on the 3-mer, so load each of them once up front.
# consensus neg
indep.DHS.control.consensus <- center.bed(read.table(paste0(dir3, "MCF7DHS_consensus_noGATA_without_motifs_123456_78.bed"), header = FALSE), upstreamWindow = 0, downstreamWindow = 0)
# peak summits, one entry per motif
GATA3_peak_summits <- lapply(setNames(my_motifs, my_motifs), function(motif) {
  center.bed(read.table(paste0(dir2, "GATA3_peak_161win_with_", motif, ".bed"), header = FALSE), upstreamWindow = 0, downstreamWindow = 0)
})

for (triplet in prioritized_triplets) {
  print(triplet)
  # 3mer genome coordinates
  plus.triplet.file <- read.triplet.bed("plus", triplet)
  minus.triplet.file <- read.triplet.bed("minus", triplet)

  # The control search is independent of the motif, so run it once per 3-mer
  # instead of once per motif; the output files are the same as before.
  write.closest(indep.DHS.control.consensus, plus.triplet.file,
                paste0('closest.1st.plus.', triplet, '.to.indep.DHS.control.consensus.bed'))
  write.closest(indep.DHS.control.consensus, minus.triplet.file,
                paste0('closest.1st.minus.', triplet, '.to.indep.DHS.control.consensus.bed'))

  for (motif in my_motifs) {
    print(motif)
    # closestBed--1st closest plus
    write.closest(GATA3_peak_summits[[motif]], plus.triplet.file,
                  paste0('closest.1st.plus.', triplet, '.to.GATA3.with.', motif, '.bed'))
    # closestBed--1st closest minus
    write.closest(GATA3_peak_summits[[motif]], minus.triplet.file,
                  paste0('closest.1st.minus.', triplet, '.to.GATA3.with.', motif, '.bed'))
  }
}

runR.sh

#!/bin/bash
# Slurm batch script that runs 240208_closestBed.R as a single job.
# Resource request below: 1 node, 1 task, 8 CPUs for that task, 200G memory,
# on the "general" partition/QOS; stdout and stderr go to files named after
# the job id (%j), and mail is sent on all job state changes.
#SBATCH --job-name=runR.sh     # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 8
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o runR.sh_%j.out
#SBATCH -e runR.sh_%j.err

# Load the R version used for this analysis, then run the closestBed script
# from the submission directory.
module load R/4.1.2
Rscript 240208_closestBed.R

5.2.2 2nd closest 3mer to the anchor 3mer

Overall goal: given a file with the coordinates of the closest (1st) 3mer, loop through a list of 3mers and, for each one, find the coordinates of the 2nd closest 3mer relative to that closest 3mer.

5.2.2.1 sort the 3mer genome coordinates

sort_3mer_cor.sh

#!/bin/bash
#SBATCH --job-name=sort_3mer_cor.sh
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 4
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=32G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o sort_3mer_cor.sh_%j.out
#SBATCH -e sort_3mer_cor.sh_%j.err

# Template job: sorts one 3mer coordinate file by chromosome, then by numeric
# start position.  XXXXXX is a placeholder that is substituted with the part of
# the file name between "hg38.3.3.3" and ".bed" before the job is submitted.
hostname
name=XXXXXX

input_dir1=/labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/

# Input is read from the seqdump directory; the sorted copy is written to the
# directory the job runs in.
unsorted_bed="${input_dir1}hg38.3.3.3${name}.bed"
sorted_bed="hg38.3.3.3${name}.sorted.bed"

sort -k1,1 -k2,2n "$unsorted_bed" > "$sorted_bed"
# Generate and submit one copy of the sort template per 3mer coordinate file
# found in the current directory.
file=sort_3mer_cor.sh

for i in hg38.3.3.3*.bed
do
    # An unmatched glob is passed through literally; skip it.
    [ -e "$i" ] || continue
    # Do not re-sort the outputs of a previous run (hg38.3.3.3*.sorted.bed
    # also matches the glob and would produce *.sorted.sorted.bed jobs).
    case "$i" in
        *.sorted.bed) continue ;;
    esac

    # Keep only the part between "hg38.3.3.3" and ".bed", e.g. "plus.36_GAT".
    nm=${i##*/}
    nm=${nm#hg38.3.3.3}
    nm=${nm%.bed}
    echo "$nm"

    # Fill the XXXXXX placeholder of the template and submit the result.
    sed -e "s/XXXXXX/${nm}/g" "$file" > "sort_hg38.3.3.3${nm}.sh"
    sbatch "sort_hg38.3.3.3${nm}.sh"
    sleep 1
done

5.2.2.2 check whether bedtools subtract changes the internal order of the file

# Sanity check: does `bedtools subtract` keep the row order of its -a input?
# The trailing "#<number>" comments are the line counts recorded when these
# commands were run.
#ls /labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/*.sorted.bed
# Subset a test file: keep only the chr4 rows of the sorted plus-strand GAT coordinates.
awk '$1 == "chr4"' /labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/hg38.3.3.3plus.36_GAT.sorted.bed > hg38.3.3.3plus.36_GAT_subset_chr4.sorted.bed

wc -l hg38.3.3.3plus.36_GAT_subset_chr4.sorted.bed #2583363

wc -l /home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/closest.1st.plus.GAT.to.GATA3.with.motif_1.bed #12470

# Keep columns 4-10 of the closestBed output, re-join them with tabs, drop
# duplicated rows and sort by chromosome and start.
# NOTE(review): `uniq` runs before `sort`, so only adjacent duplicate rows are
# removed here; confirm that duplicates are always adjacent in this file.
awk '{print $4, $5, $6, $7, $8, $9, $10}' /home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/closest.1st.plus.GAT.to.GATA3.with.motif_1.bed  | awk '{$1=$1}1' OFS="\t" | uniq | sort -k1,1 -k2,2n > closest.1st.plus.GAT.to.GATA3.with.motif_1.uniq.sorted.bed

wc -l closest.1st.plus.GAT.to.GATA3.with.motif_1.uniq.sorted.bed #12461 # fewer rows than the original file because some peaks share the same closest 3mer coordinates

module load bedtools 

# Remove the closest 3mers from the chr4 subset: -f 1.00 requires full overlap,
# -s requires the same strand.
bedtools subtract -a hg38.3.3.3plus.36_GAT_subset_chr4.sorted.bed -b closest.1st.plus.GAT.to.GATA3.with.motif_1.uniq.sorted.bed -f 1.00 -s > substract_output.bed #2582612

# Re-sort the subtract output and compare it with the unsorted output.
sort -k1,1 -k2,2n substract_output.bed > substract_output.sorted.bed

diff substract_output.bed substract_output.sorted.bed
# No output was printed on the screen: the two files are the same.

bedtools subtract does not change the order of the input files.

5.2.2.3 For each 3mer genome coordinates, remove the identified closest 3mer on the same strand

Remove the first closest GAT with bedtools subtract
Subtract the 1st closest GAT from all.GAT, then find the closest 2nd GAT to the closest 1st GAT.
-f Requires a minimal overlap fraction before subtracting. Here we set -f 1.00 so that a feature in the -a file is subtracted only when 100% of it is overlapped by a feature in the -b file.
-s Enforcing same “strandedness” while scanning for features in -b file that should be subtracted from -a file.

#!/bin/bash
#SBATCH --job-name=remove_1st_3mer.sh     # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 16
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o remove_1st_3mer.sh_%j.out
#SBATCH -e remove_1st_3mer.sh_%j.err

# For every prioritized 3mer, remove the already-identified closest (1st) 3mer
# occurrences from the genome-wide, same-strand 3mer coordinates, so that a
# later closestBed run can report the 2nd closest occurrence.

input_dir1=/labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/
input_dir2=/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/

#output_dir=/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/closest_2nd_other_3mer/

my_motifs=("motif_1" "motif_2" "motif_4" "motif_5" "motif_6")
prioritized_triplets=("AAA" "TAA" "ATA" "TTA" "AAT" "TAT" "GAT" "ATT" "TTT" "ATC" "AGA" "TCT" "TAG" "CTA")

module load bedtools

# resolve_sorted_bed STRAND TRIPLET
# Print the path of the sorted genome-wide coordinate file for one strand
# ("plus"/"minus") and one triplet.  The exact name used so far is tried first;
# if it is missing, fall back to a name carrying a numeric prefix before the
# triplet (e.g. hg38.3.3.3plus.36_GAT.sorted.bed), which is how the sorted files
# are named elsewhere in this analysis.  Returns non-zero if no single file fits.
resolve_sorted_bed() {
  local strand=$1 triplet=$2
  local exact="${input_dir1}hg38.3.3.3${strand}.${triplet}.sorted.bed"
  if [[ -e "$exact" ]]; then
    printf '%s\n' "$exact"
    return 0
  fi
  local matches=( "${input_dir1}hg38.3.3.3${strand}."*"_${triplet}.sorted.bed" )
  if [[ ${#matches[@]} -eq 1 && -e "${matches[0]}" ]]; then
    printf '%s\n' "${matches[0]}"
    return 0
  fi
  echo "ERROR: no unique sorted ${strand} coordinate file for ${triplet} in ${input_dir1}" >&2
  return 1
}

# remove_closest STRAND TRIPLET STEM OUTPUT
# STEM is the closestBed result name without ".bed" (read from input_dir2).
# Columns 4-10 of that file are written tab-separated, sorted by chromosome and
# start, and de-duplicated.  The sort happens BEFORE uniq, because uniq only
# collapses adjacent identical lines.  The resulting intervals are subtracted
# from the genome-wide file:
#   -f 1.00  subtract only on 100% overlap
#   -s       subtract only features on the same strand
# The intermediate *.uniq.sorted.bed file is removed afterwards.
remove_closest() {
  local strand=$1 triplet=$2 stem=$3 output=$4
  local uniq_bed="${stem}.uniq.sorted.bed"
  local genome_bed
  genome_bed=$(resolve_sorted_bed "$strand" "$triplet") || return 1

  awk 'BEGIN { OFS = "\t" } { print $4, $5, $6, $7, $8, $9, $10 }' "${input_dir2}${stem}.bed" \
    | sort -k1,1 -k2,2n | uniq > "$uniq_bed"

  bedtools subtract -a "$genome_bed" -b "$uniq_bed" -f 1.00 -s > "$output"

  rm "$uniq_bed"
}

# GATA3 peak sets: one output per triplet, motif and strand.
for triplet in "${prioritized_triplets[@]}"
do
  echo "$triplet"

  for motif in "${my_motifs[@]}"
  do
    echo "$motif"
    for strand in plus minus
    do
      remove_closest "$strand" "$triplet" \
        "closest.1st.${strand}.${triplet}.to.GATA3.with.${motif}" \
        "hg38.3.3.3${strand}.${triplet}_without_1st_${strand}_${triplet}_to_GATA3_with_${motif}.bed"
    done
  done
done

# independent DHS control: one output per triplet and strand.  The "36_" in the
# output name is kept as is because the downstream R script reads that name.
for triplet in "${prioritized_triplets[@]}"
do
  echo "$triplet"
  for strand in plus minus
  do
    remove_closest "$strand" "$triplet" \
      "closest.1st.${strand}.${triplet}.to.indep.DHS.control.consensus" \
      "hg38.3.3.3${strand}.36_${triplet}_without_1st_${strand}_${triplet}_to_indep_DHS_control.bed"
  done
done

5.2.2.4 find the closest 2nd 3mer to peak summits

Identify redundant spaced 3mers, avoid showing duplicated results.

In our prioritized list, we have 14 unique 3mers, which can be described as 7 pairs consisting of a 3mer and its reverse complement.
Pairing them in every order would yield 196 (14 × 14) combinations. However, many of these combinations are redundant, because a pair carries the same information as its reverse complement read on the same strand. For instance, the plus strand ATC-TTT is equivalent to plus strand AAA-GAT, and AAA-AAA is equivalent to TTT-TTT on the same strand.

It is important to list all unique combinations of spaced 3-mers so that we do not show redundant information in the downstream analysis.

Additionally, in our calculation of relative distances, it’s essential to account for the upstream and downstream orientation of the 3mers. For example, (gat)AAA-TTA(tc) and TTA(tc)-(gat)AAA represent distinct configurations.

# Return the reverse complement of a plus-strand DNA sequence, i.e. the
# sequence that the opposite strand shows when it is read 5'->3'.
#
# sequence: character vector of DNA sequences (any length, not only 6mers).
# Returns a character vector of the same length as `sequence`.
#
# A/C/G/T are complemented in upper and lower case; any other character (for
# example "N") is kept unchanged at its mirrored position instead of being
# turned into the text "NA".
convert_plus_strand_6mer <- function(sequence) {
  bases_per_sequence <- strsplit(as.character(sequence), "", fixed = TRUE)

  vapply(
    bases_per_sequence,
    function(bases) {
      # Complement each base, then reverse the order of the bases.
      paste(rev(chartr("ACGTacgt", "TGCAtgca", bases)), collapse = "")
    },
    character(1),
    USE.NAMES = FALSE
  )
}

# Prioritized 3mers (7 triplets plus their reverse complements)
prioritized_triplets <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")

# 6mers kept so far; a 6mer is skipped when its reverse complement was
# already kept earlier in the scan.
non_redundant_6mers <- list()
for (upstream_triplet in prioritized_triplets) {
  for (downstream_triplet in prioritized_triplets) {
    # Join the ordered pair of triplets into one 6mer
    sixmer <- paste0(upstream_triplet, downstream_triplet)

    # Same-strand reverse complement of that 6mer
    converted_reverse_complement_sixmer <- convert_plus_strand_6mer(sixmer)

    # Skip the 6mer if its reverse complement is already in the kept set
    if (converted_reverse_complement_sixmer %in% non_redundant_6mers) {
      next
    }
    non_redundant_6mers[[length(non_redundant_6mers) + 1]] <- sixmer
  }
}

# Split every kept 6mer back into its two triplets: one row per pair
first_3_bases <- substr(non_redundant_6mers, 1, 3)
last_3_bases <- substr(non_redundant_6mers, 4, 6)
df <- data.frame(First_3_bases = first_3_bases, Last_3_bases = last_3_bases)
nrow(df)
## [1] 105

The above R code generates the non-redundant 6-mers from the prioritized triplets and then creates a data frame to represent them.

First, we define a convert_plus_strand_6mer function that reverses the input DNA sequence, finds the complementary base for each base, and then combines them into a string representing the reverse complement.
Then we perform a nested loop, which iterates through each pair of prioritized triplets.
For each pair, it concatenates them to form a 6-mer, and then finds the reverse complement of the 6-mer using the convert_plus_strand_6mer function. If the reverse complement is already in non_redundant_6mers, we do not include the 6mer to the final output.

Through running this code, we identified 105 combinations of 3mer pairs. These pairs are unique while considering them to be on the same strand.

However, this redundancy only applies when 3mer combinations are identified on both the plus and minus strands, where a pair found on one strand is the reverse complement of a pair found on the other. If we limit our focus solely to 3mer-3mer pairs on the same strand, specifically the plus strand, each of the 196 combinations is distinct from the others and has to be analysed separately.

Note that the relative distance needs to be carefully defined for each 3mer.
I am using the bigWig package to anchor at the specific G/C base for each 3mer. The fiveprime.bed() defines the anchor point based on the strand information:
If strand = '+' while using fiveprime.bed
anchor point = original start
start = anchor point - upstreamwindow
end = anchor point + 1 + downstreamwindow
If strand = '-' while using fiveprime.bed
anchor point = original end
start = anchor point - downstreamwindow
end = anchor point + 1 + upstreamwindow

I have defined the upstream/downstream window value for each 3mer in the prioritized set, in this .csv file:

#pattern_anchor_at_GorC_for_bigWig_pkg.csv
# Print the anchor table: one row per prioritized 3mer with the plus/minus
# strand upstream and downstream window values listed in the output below.
read.csv('pattern_anchor_at_GorC_for_bigWig_pkg.csv')
##    pattern identifier plus_upstream plus_downstream minus_upstream
## 1      GAT        GAT             0               0             -1
## 2      ATC        ATC            -2               2             -3
## 3      ATA     (g)ATA             1              -1              0
## 4      TAT     TAT(c)            -3               3             -4
## 5      TTA    TTA(tc)            -4               4             -5
## 6      TAA    (ga)TAA             2              -2              1
## 7      AAT     AAT(c)            -3               3             -4
## 8      ATT     (g)ATT             1              -1              0
## 9      AAA   (gat)AAA             3              -3              2
## 10     TTT   TTT(atc)            -5               5             -6
## 11     AGA     AGA(t)            -1               1             -2
## 12     TCT     (a)TCT            -1               1             -2
## 13     TAG    (ga)TAG             2              -2              1
## 14     CTA    CTA(tc)            -4               4             -5
##    minus_downstream
## 1                 1
## 2                 3
## 3                 0
## 4                 4
## 5                 5
## 6                -1
## 7                 4
## 8                 0
## 9                -2
## 10                6
## 11                2
## 12                2
## 13               -1
## 14                5

Automate analysis on all prioritized 3mers: find the closest 2nd 3mer.

Anchor at plus strand closest 3mer, find the 2nd closest plus strand 3mer relative to the closest 3mer.

240225_closestBed.R

#!/usr/bin/env Rscript

Args=commandArgs(TRUE)

# Run bedtools closestBed on two BED-like data frames and return its output.
#
# functionstring: path of the closestBed executable.
# bed1:           query intervals; written to disk and sorted by chromosome and
#                 numeric start before the call.
# bed2:           target intervals; written to disk as they are, so they must
#                 already be coordinate-sorted.
# opt.string:     extra closestBed options, e.g. '-d -t first'.
#
# Returns the closestBed output as a data frame whose columns are named after
# bed1, then bed2, then 'dis' for the last column.
# Stops with an error if either shell command exits with a non-zero status.
bedTools.closest.mod <- function(functionstring="/home/FCAM/ssun/packages/bedtools2/bin/closestBed",bed1,bed2,opt.string="") {

  # Do not use scientific notation when writing coordinates.  The option is
  # deliberately left set after the call: the calling script writes the result
  # with write.table() afterwards and needs plain integers as well.
  options(scipen = 99)

  # Intermediate files get unique names in the working directory, so that two
  # runs sharing a directory cannot overwrite each other's files, and they are
  # removed even when a command fails.
  work_dir <- getwd()
  a_file <- tempfile("a.file.", tmpdir = work_dir, fileext = ".bed")
  a_sorted_file <- tempfile("a.file.sorted.", tmpdir = work_dir, fileext = ".bed")
  b_sorted_file <- tempfile("b.file.sorted.", tmpdir = work_dir, fileext = ".bed")
  out_file <- tempfile("out.file.", tmpdir = work_dir, fileext = ".bed")
  on.exit(unlink(c(a_file, a_sorted_file, b_sorted_file, out_file)), add = TRUE)

  # Echo a shell command, run it, and fail loudly on a non-zero exit status
  # (system() reports failures through its return value, not as an R error).
  run_shell <- function(cmd) {
    cat(cmd, "\n")
    status <- system(cmd)
    if (!identical(status, 0L)) {
      stop("command failed with exit status ", status, ": ", cmd, call. = FALSE)
    }
    invisible(status)
  }

  # write bed formatted data.frames to the intermediate files
  write.table(bed1, file = a_file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  write.table(bed2, file = b_sorted_file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)

  # sort the query file by coordinates (the target file is not re-sorted)
  run_shell(paste("sort -k1,1 -k2,2n", shQuote(a_file), ">", shQuote(a_sorted_file)))

  # call closestBed on the sorted query and the target file
  run_shell(paste(functionstring, opt.string, "-a", shQuote(a_sorted_file), "-b", shQuote(b_sorted_file), ">", shQuote(out_file), sep = " "))

  res <- read.table(out_file, header = FALSE, comment.char = "")

  colnames(res) <- c(colnames(bed1), colnames(bed2), "dis")
  return(res)
}

library(bigWig)

# List of prioritized triplets
prioritized_triplets <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")

# Every ordered pair of triplets (all combinations, redundant ones included).
# The first triplet changes slowest and the second fastest, which is the order
# a nested loop over the triplets would produce.
n_triplets <- length(prioritized_triplets)
first_3_bases <- rep(prioritized_triplets, each = n_triplets)
last_3_bases <- rep(prioritized_triplets, times = n_triplets)

# The same pairs joined into 6mers, kept as a list of single strings
all_6mers <- as.list(paste0(first_3_bases, last_3_bases))

# One row per ordered pair: first triplet and second triplet
df <- data.frame(First_3_bases = first_3_bases, Last_3_bases = last_3_bases)


# nested loop
dir1="/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/"
dir2="/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/closest_2nd_other_3mer/"


motif="XXXXXX"

# For every (anchor 3mer = pattern1, query 3mer = pattern2) row of df, find the
# query 3mer closest to each anchor and write one result file per pair.
# The genome-wide query file depends on pattern2 only, so it is read once per
# query 3mer and reused for all of its anchors instead of being re-read for
# every row of df.  The set of output files and their contents do not depend
# on the iteration order.
for (pattern2 in unique(df[, 2])) {
  # query 3mer coordinates on genome (without the overlapped closest 3mer coordinates) - anchor at the first letter base
  print(pattern2)
  plus.3mer.file <- fiveprime.bed(read.table(file = paste0(dir2, "hg38.3.3.3plus.", pattern2, "_without_1st_plus_", pattern2, "_to_GATA3_with_", motif, ".bed"), sep = "\t", header = FALSE), upstreamWindow = 0, downstreamWindow = 0)

  for (pattern1 in df[df[, 2] == pattern2, 1]) {
    # anchor position: closest +/- 3mer, anchor at the first letter base.
    # Columns 4:11 are the columns that follow the three leading interval
    # columns of the 1st-closest result file.
    print(pattern1)
    closest_plus_3mer_to_GATA3_peak_summits <- fiveprime.bed(read.table(paste0(dir1, "closest.1st.plus.", pattern1, ".to.GATA3.with.", motif, ".bed"), header = FALSE)[, 4:11], upstreamWindow = 0, downstreamWindow = 0)

    # 2nd closest plus 3mer to closest plus 3mer
    closest.2nd.plus.3mer.to.1st.plus.3mer.GATA3.with.motif <- bedTools.closest.mod(bed1 = closest_plus_3mer_to_GATA3_peak_summits[, 1:3], bed2 = plus.3mer.file, opt.string = '-d -t first')

    write.table(closest.2nd.plus.3mer.to.1st.plus.3mer.GATA3.with.motif, file = paste0("closest.2nd.plus.", pattern2, ".to.1st.plus.", pattern1, ".GATA3.with.", motif, ".bed"), quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  }
}

runR.sh

#!/bin/bash
#SBATCH --job-name=runR_XXXXXX.sh     # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 8
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=64G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o runR_XXXXXX.sh_%j.out
#SBATCH -e runR_XXXXXX.sh_%j.err

# Template job: XXXXXX is a placeholder that is replaced by a motif name before
# submission.  Each motif runs in its own sub-directory, so the intermediate
# and result files of different motifs do not mix.
hostname
name=XXXXXX

# -p: a re-submitted job must not fail because the directory already exists.
mkdir -p "GATA3_peak_with_${name}"
# Stop instead of writing results into the parent directory if cd fails.
cd "GATA3_peak_with_${name}" || exit 1

module load R/4.1.2
Rscript "../240225_closestBed_${name}.R"

parallel running:

# Templates whose XXXXXX placeholder is filled in once per motif.
r_file=240225_closestBed.R
sh_file=runR.sh

my_motifs=("motif_1" "motif_2" "motif_4" "motif_5" "motif_6")

# fill_template TEMPLATE MOTIF OUTPUT: copy TEMPLATE to OUTPUT with every
# XXXXXX replaced by MOTIF.
fill_template() {
    sed -e "s/XXXXXX/$2/g" "$1" > "$3"
}

# Pass 1: write one R script per motif.
for motif_name in "${my_motifs[@]}"
do
    echo "$motif_name"
    fill_template "$r_file" "$motif_name" "240225_closestBed_${motif_name}.R"
done

# Pass 2: write one job script per motif and submit it, one second apart.
for motif_name in "${my_motifs[@]}"
do
    echo "$motif_name"
    job_script="runR_${motif_name}.sh"
    fill_template "$sh_file" "$motif_name" "$job_script"
    sbatch "$job_script"
    sleep 1
done

240225_closestBed_DHS.R

#!/usr/bin/env Rscript

Args=commandArgs(TRUE)

# Run bedtools closestBed on two BED-like data frames and return its output.
#
# functionstring: path of the closestBed executable.
# bed1:           query intervals; written to disk and sorted by chromosome and
#                 numeric start before the call.
# bed2:           target intervals; written to disk as they are, so they must
#                 already be coordinate-sorted.
# opt.string:     extra closestBed options, e.g. '-d -t first'.
#
# Returns the closestBed output as a data frame whose columns are named after
# bed1, then bed2, then 'dis' for the last column.
# Stops with an error if either shell command exits with a non-zero status.
bedTools.closest.mod <- function(functionstring="/home/FCAM/ssun/packages/bedtools2/bin/closestBed",bed1,bed2,opt.string="") {

  # Do not use scientific notation when writing coordinates.  The option is
  # deliberately left set after the call: the calling script writes the result
  # with write.table() afterwards and needs plain integers as well.
  options(scipen = 99)

  # Intermediate files get unique names in the working directory, so that two
  # runs sharing a directory cannot overwrite each other's files, and they are
  # removed even when a command fails.
  work_dir <- getwd()
  a_file <- tempfile("a.file.", tmpdir = work_dir, fileext = ".bed")
  a_sorted_file <- tempfile("a.file.sorted.", tmpdir = work_dir, fileext = ".bed")
  b_sorted_file <- tempfile("b.file.sorted.", tmpdir = work_dir, fileext = ".bed")
  out_file <- tempfile("out.file.", tmpdir = work_dir, fileext = ".bed")
  on.exit(unlink(c(a_file, a_sorted_file, b_sorted_file, out_file)), add = TRUE)

  # Echo a shell command, run it, and fail loudly on a non-zero exit status
  # (system() reports failures through its return value, not as an R error).
  run_shell <- function(cmd) {
    cat(cmd, "\n")
    status <- system(cmd)
    if (!identical(status, 0L)) {
      stop("command failed with exit status ", status, ": ", cmd, call. = FALSE)
    }
    invisible(status)
  }

  # write bed formatted data.frames to the intermediate files
  write.table(bed1, file = a_file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  write.table(bed2, file = b_sorted_file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)

  # sort the query file by coordinates (the target file is not re-sorted)
  run_shell(paste("sort -k1,1 -k2,2n", shQuote(a_file), ">", shQuote(a_sorted_file)))

  # call closestBed on the sorted query and the target file
  run_shell(paste(functionstring, opt.string, "-a", shQuote(a_sorted_file), "-b", shQuote(b_sorted_file), ">", shQuote(out_file), sep = " "))

  res <- read.table(out_file, header = FALSE, comment.char = "")

  colnames(res) <- c(colnames(bed1), colnames(bed2), "dis")
  return(res)
}

library(bigWig)

# List of prioritized triplets
prioritized_triplets <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")

# Every ordered pair of triplets (all combinations, redundant ones included).
# The first triplet changes slowest and the second fastest, which is the order
# a nested loop over the triplets would produce.
n_triplets <- length(prioritized_triplets)
first_3_bases <- rep(prioritized_triplets, each = n_triplets)
last_3_bases <- rep(prioritized_triplets, times = n_triplets)

# The same pairs joined into 6mers, kept as a list of single strings
all_6mers <- as.list(paste0(first_3_bases, last_3_bases))

# One row per ordered pair: first triplet and second triplet
df <- data.frame(First_3_bases = first_3_bases, Last_3_bases = last_3_bases)


# nested loop
dir1="/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/"
dir2="/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/closest_2nd_other_3mer/"


# DHS regions
# For every (anchor 3mer = pattern1, query 3mer = pattern2) row of df, find the
# query 3mer closest to each anchor and write one result file per pair.
# The genome-wide query file depends on pattern2 only, so it is read once per
# query 3mer and reused for all of its anchors instead of being re-read for
# every row of df.  The set of output files and their contents do not depend
# on the iteration order.
for (pattern2 in unique(df[, 2])) {
  # query 3mer coordinates on genome (without the overlapped closest 3mer coordinates) - relative to G/C
  print(pattern2)
  plus.3mer.file <- fiveprime.bed(read.table(file = paste0(dir2, "hg38.3.3.3plus.36_", pattern2, "_without_1st_plus_", pattern2, "_to_indep_DHS_control.bed"), sep = "\t", header = FALSE), upstreamWindow = 0, downstreamWindow = 0)

  for (pattern1 in df[df[, 2] == pattern2, 1]) {
    # anchor position: closest +/- pattern1 relative to G/C.
    # Columns 4:11 are the columns that follow the three leading interval
    # columns of the 1st-closest result file.
    print(pattern1)
    closest_plus_3mer_to_DHS <- fiveprime.bed(read.table(paste0(dir1, "closest.1st.plus.", pattern1, ".to.indep.DHS.control.consensus.bed"), header = FALSE)[, 4:11], upstreamWindow = 0, downstreamWindow = 0)

    # 2nd closest plus 3mer to closest plus 3mer
    closest.2nd.plus.3mer.to.1st.plus.3mer.DHS <- bedTools.closest.mod(bed1 = closest_plus_3mer_to_DHS[, 1:3], bed2 = plus.3mer.file, opt.string = '-d -t first')

    write.table(closest.2nd.plus.3mer.to.1st.plus.3mer.DHS, file = paste0("closest.2nd.plus.", pattern2, ".to.1st.plus.", pattern1, ".indep.DHS.control.bed"), quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  }
}

runR_DHS.sh

#!/bin/bash
#SBATCH --job-name=runR_DHS.sh     # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 8
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o runR_DHS.sh_%j.out
#SBATCH -e runR_DHS.sh_%j.err

# Run the DHS control analysis inside its own sub-directory so that its
# intermediate and result files stay separate from other runs.
hostname
# -p: a re-submitted job must not fail because the directory already exists.
mkdir -p DHS
# Stop instead of writing results into the parent directory if cd fails.
cd DHS || exit 1

module load R/4.1.2
Rscript ../240225_closestBed_DHS.R

Coherence check: inspect some coordinates on the UCSC Genome Browser to make sure that we are anchoring at the correct G/C.

For some 3mers, we empirically determined a relative G/C position based on the GATA3 PWMs, so it is normal that some of the anchored positions are not exactly a G or a C.

6 Plot the enrichment frequencies

6.1 GATA3 peak with motifs

Make minimal plots to communicate results.

For each peak set, (5 positive GATA3 peak sets and one DHS independent regions), there are 196 unique 3mer pairs with different relative distances.
Since these 3mer pairs are prioritized based on their potential enrichment in GATA3 peak sets, they might convey similar enrichment information. Thus we could plot bw plots at different distances, showing which distance they all prefer.

A density plot will have 105 traces for each peak set.

Or anchor at 1st closest GAT, for instance, then plot traces of 2nd closest 3mer relative to this GAT

6.1.1 motif1

ls *to.1st.plus.GAT.GATA3.with.motif_1.bed
wc -l *to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.GATA3.with.motif_1.bed
##    12470 closest.2nd.plus.AAA.to.1st.plus.GAT.GATA3.with.motif_1.bed
##    12470 closest.2nd.plus.AAT.to.1st.plus.GAT.GATA3.with.motif_1.bed
##    12470 closest.2nd.plus.AGA.to.1st.plus.GAT.GATA3.with.motif_1.bed
##    12470 closest.2nd.plus.ATA.to.1st.plus.GAT.GATA3.with.motif_1.bed
##    12470 closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3.with.motif_1.bed
##    12470 closest.2nd.plus.ATT.to.1st.plus.GAT.GATA3.with.motif_1.bed
##    12470 closest.2nd.plus.CTA.to.1st.plus.GAT.GATA3.with.motif_1.bed
##    12470 closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3.with.motif_1.bed
##    12470 closest.2nd.plus.TAA.to.1st.plus.GAT.GATA3.with.motif_1.bed
##    12470 closest.2nd.plus.TAG.to.1st.plus.GAT.GATA3.with.motif_1.bed
##    12470 closest.2nd.plus.TAT.to.1st.plus.GAT.GATA3.with.motif_1.bed
##    12470 closest.2nd.plus.TCT.to.1st.plus.GAT.GATA3.with.motif_1.bed
##    12470 closest.2nd.plus.TTA.to.1st.plus.GAT.GATA3.with.motif_1.bed
##    12470 closest.2nd.plus.TTT.to.1st.plus.GAT.GATA3.with.motif_1.bed
##   174580 total

Prepare data:

# Relative frequency of every distinct value in `data`.
#
# data: vector of observations (e.g. absolute distances).
# Returns a data frame with one row per distinct value:
#   value       - the value, as text (taken from the table names)
#   actual_freq - count of that value divided by length(data)
calculate_actual_frequency <- function(data) {
  counts <- table(data)
  proportions <- counts / length(data)
  data.frame(
    value = names(proportions),
    actual_freq = as.vector(proportions)
  )
}


#my_motifs = c("motif_1", "motif_4","motif_2", "motif_6", "motif_5")
my_motifs <- c("motif_1")
Anchor_triplets <- c("GAT")
Query_triplets <- c("AAA", "TAA" ,"ATA" ,"TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")

# Collect, for every motif / anchor 3mer / query 3mer, the distance between the
# anchor and its closest query 3mer together with the frequency of each
# absolute distance within that result file.
#
# df.all is created once, before the motif loop.  Creating it inside the loop
# discarded the rows of all but the last motif whenever my_motifs listed more
# than one motif; with a single motif the result is unchanged.
df.all <- data.frame(matrix(nrow = 0, ncol = 6))
for (motif in my_motifs) {
  print(motif)
  for (Anchor_triplet in Anchor_triplets) {
    print(Anchor_triplet)
    df.peak.dis.all <- data.frame(matrix(nrow = 0, ncol = 6))
    for (Query_triplet in Query_triplets) {
      print(Query_triplet)
      df.peak.dis <- data.frame(matrix(nrow = 0, ncol = 6))
      colnames(df.peak.dis) <- c("dis", "anchor_3mer", "query_3mer", "status", "abs.dis", "actual_freq")
      # Sys.glob returns nothing when the result file is missing, in which case
      # this pair is skipped.
      for (closest_2nd_dis in Sys.glob(file.path(paste0("./closest.2nd.plus.", Query_triplet, ".to.1st.plus.", Anchor_triplet, ".GATA3.with.", motif, ".bed")))) {
        print(closest_2nd_dis)
        # Column 11 of the result file is used as the distance; the motif name
        # is stored in the "status" column.
        temp <- as.data.frame(cbind(read.table(closest_2nd_dis, header = FALSE, comment.char = '')[, 11], Anchor_triplet, Query_triplet, motif))
        colnames(temp) <- c("dis", "anchor_3mer", "query_3mer", "status")
        # cbind() with text turned the distances into strings; convert back.
        temp$dis <- as.integer(temp$dis)
        temp$abs.dis <- abs(temp$dis)
        actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
        # Attach to every row the frequency of its absolute distance.
        temp1 <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
        df.peak.dis <- rbind(df.peak.dis, temp1)
      }
      df.peak.dis.all <- rbind(df.peak.dis.all, df.peak.dis)
    }
    df.all <- rbind(df.all, df.peak.dis.all)
  }
}
## [1] "motif_1"
## [1] "GAT"
## [1] "AAA"
## [1] "./closest.2nd.plus.AAA.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "TAA"
## [1] "./closest.2nd.plus.TAA.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "ATA"
## [1] "./closest.2nd.plus.ATA.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "TTA"
## [1] "./closest.2nd.plus.TTA.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "AAT"
## [1] "./closest.2nd.plus.AAT.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "TAT"
## [1] "./closest.2nd.plus.TAT.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "GAT"
## [1] "./closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "ATT"
## [1] "./closest.2nd.plus.ATT.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "TTT"
## [1] "./closest.2nd.plus.TTT.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "ATC"
## [1] "./closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "AGA"
## [1] "./closest.2nd.plus.AGA.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "TCT"
## [1] "./closest.2nd.plus.TCT.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "TAG"
## [1] "./closest.2nd.plus.TAG.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "CTA"
## [1] "./closest.2nd.plus.CTA.to.1st.plus.GAT.GATA3.with.motif_1.bed"
str(df.all)
## 'data.frame':    174580 obs. of  6 variables:
##  $ abs.dis    : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ dis        : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ anchor_3mer: chr  "GAT" "GAT" "GAT" "GAT" ...
##  $ query_3mer : chr  "AAA" "AAA" "AAA" "AAA" ...
##  $ status     : chr  "motif_1" "motif_1" "motif_1" "motif_1" ...
##  $ actual_freq: num  0.0573 0.0573 0.0573 0.0573 0.0573 ...
unique(df.all$anchor_3mer)
## [1] "GAT"
unique(df.all$query_3mer)
##  [1] "AAA" "TAA" "ATA" "TTA" "AAT" "TAT" "GAT" "ATT" "TTT" "ATC" "AGA" "TCT"
## [13] "TAG" "CTA"
nrow(df.all)
## [1] 174580
head(df.all)
##   abs.dis dis anchor_3mer query_3mer  status actual_freq
## 1       3   3         GAT        AAA motif_1  0.05725742
## 2       3   3         GAT        AAA motif_1  0.05725742
## 3       3   3         GAT        AAA motif_1  0.05725742
## 4       3   3         GAT        AAA motif_1  0.05725742
## 5       3   3         GAT        AAA motif_1  0.05725742
## 6       3   3         GAT        AAA motif_1  0.05725742
# Keep one row per distinct combination of columns, then label each row with
# its "anchor-query" pair.
df.all.unique <- unique(df.all)
df.all.unique$pattern <- paste(df.all.unique$anchor_3mer, df.all.unique$query_3mer, sep = "-")

unique(df.all.unique$pattern)
##  [1] "GAT-AAA" "GAT-TAA" "GAT-ATA" "GAT-TTA" "GAT-AAT" "GAT-TAT" "GAT-GAT"
##  [8] "GAT-ATT" "GAT-TTT" "GAT-ATC" "GAT-AGA" "GAT-TCT" "GAT-TAG" "GAT-CTA"
# Fix the order of the pairs explicitly via the factor levels.
df.all.unique$pattern <- factor(
  df.all.unique$pattern,
  levels = c(
    "GAT-AAA", "GAT-TAA", "GAT-ATA", "GAT-TTA", "GAT-AAT", "GAT-TAT", "GAT-GAT",
    "GAT-ATT", "GAT-TTT", "GAT-ATC", "GAT-AGA", "GAT-TCT", "GAT-TAG", "GAT-CTA"
  )
)
library(lattice)
library(latticeExtra)
# Scatter plot of the frequency of each absolute distance (actual_freq) against
# the distance itself (abs.dis), grouped by anchor-query pattern.  The x axis is
# limited to 0-50 and the y axis to 0-0.3.  The commented-out pdf()/print()/
# dev.off() lines can be re-enabled to write the plot to a PDF file.
#pdf(paste0('xy_closest_2nd_GAT_to_closest_1st_GAT_to_GATA3_pos_', motif, '_compare_to_DHS.pdf'), width=15,height=5)
#print(
  xyplot(actual_freq ~ abs.dis, 
         data = df.all.unique, 
         groups = pattern,
         # legend on the right, drawn with lines only
         auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
         aspect = 1,
         xlim=c(0,50),
         ylim=c(0, 0.3),
         #type = c('p', 'smooth'),
         xlab = "relative distance between two triplets (bp)",
         ylab="Frequency of Enrichment",
         main="GATA3 peak with motif1",
         between=list(y=1.0),
         # x tick marks every 5 bp, labels rotated by 45 degrees
         scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
         # 14 colours, one per pattern; the same vector is repeated in the panel
         # function below so that the legend and the points agree
         par.settings = list(superpose.line = list(col=c("blue", "red", "green", "cyan", "magenta", "yellow", 
            "orange", "purple", "pink", "darkgreen", "purple4", 
            "brown", "slategray", "darkolivegreen"), lwd=2), strip.background=list(col="grey85")),
         # custom panel: light grey vertical guide lines at 1..15, then the points
         panel = function(x,y,...) {panel.abline(v=c(seq(1, 15, 1)), col = "grey90")
                                    panel.xyplot(x, y, 
                                                 #col=c(colorRampPalette(c("red","blue"))(14)), 
                                                 col=c("blue", "red", "green", "cyan", "magenta", "yellow", 
            "orange", "purple", "pink", "darkgreen", "purple4", 
            "brown", "slategray", "darkolivegreen"),
                                                 pch=18, 
                                                 cex=0.6,...)
                                    
                                      
                                      
  })

#)
#dev.off()

Compare to DHS:

ls *to.1st.plus.GAT.indep.DHS.control.bed
wc -l *to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.indep.DHS.control.bed
##    57906 closest.2nd.plus.AAA.to.1st.plus.GAT.indep.DHS.control.bed
##    57906 closest.2nd.plus.AAT.to.1st.plus.GAT.indep.DHS.control.bed
##    57906 closest.2nd.plus.AGA.to.1st.plus.GAT.indep.DHS.control.bed
##    57906 closest.2nd.plus.ATA.to.1st.plus.GAT.indep.DHS.control.bed
##    57906 closest.2nd.plus.ATC.to.1st.plus.GAT.indep.DHS.control.bed
##    57906 closest.2nd.plus.ATT.to.1st.plus.GAT.indep.DHS.control.bed
##    57906 closest.2nd.plus.CTA.to.1st.plus.GAT.indep.DHS.control.bed
##    57906 closest.2nd.plus.GAT.to.1st.plus.GAT.indep.DHS.control.bed
##    57906 closest.2nd.plus.TAA.to.1st.plus.GAT.indep.DHS.control.bed
##    57906 closest.2nd.plus.TAG.to.1st.plus.GAT.indep.DHS.control.bed
##    57906 closest.2nd.plus.TAT.to.1st.plus.GAT.indep.DHS.control.bed
##    57906 closest.2nd.plus.TCT.to.1st.plus.GAT.indep.DHS.control.bed
##    57906 closest.2nd.plus.TTA.to.1st.plus.GAT.indep.DHS.control.bed
##    57906 closest.2nd.plus.TTT.to.1st.plus.GAT.indep.DHS.control.bed
##   810684 total

DHS:

Anchor_triplets <- c("GAT")
Query_triplets <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")

# Build one long table for the independent DHS control: for every
# anchor/query pair, read column 11 of the matching BED file (the signed
# distance), attach the relative frequency of each absolute distance, and
# append the rows to df.all.DHS.
df.all.DHS <- data.frame(matrix(nrow = 0, ncol = 6))
for (Anchor_triplet in Anchor_triplets) {
  print(Anchor_triplet)
  for (Query_triplet in Query_triplets) {
    print(Query_triplet)
    bed_glob <- paste0("./closest.2nd.plus.", Query_triplet, ".to.1st.plus.", Anchor_triplet, ".indep.DHS.control.bed")
    for (closest_2nd_dis in Sys.glob(bed_glob)) {
      print(closest_2nd_dis)
      signed_dis <- read.table(closest_2nd_dis, header = FALSE, comment.char = "")[, 11]
      dis_tbl <- data.frame(
        dis = as.integer(signed_dis),
        anchor_3mer = Anchor_triplet,
        query_3mer = Query_triplet,
        status = "indep.DHS.control",
        stringsAsFactors = FALSE
      )
      dis_tbl$abs.dis <- abs(dis_tbl$dis)
      freq_tbl <- calculate_actual_frequency(dis_tbl$abs.dis)
      # Left join so every distance row carries the frequency of its |distance|.
      df.all.DHS <- rbind(
        df.all.DHS,
        merge(dis_tbl, freq_tbl, by.x = "abs.dis", by.y = "value", all.x = TRUE)
      )
    }
  }
}


# Quick inspection of the assembled DHS control table.
str(df.all.DHS)
unique(df.all.DHS$anchor_3mer)
unique(df.all.DHS$query_3mer)
nrow(df.all.DHS)
head(df.all.DHS)

# Keep one copy of each distinct row and label it "<anchor>-<query>".
df.all.DHS.unique <- unique(df.all.DHS)
df.all.DHS.unique$pattern <- paste(
  df.all.DHS.unique$anchor_3mer,
  df.all.DHS.unique$query_3mer,
  sep = "-"
)

unique(df.all.DHS.unique$pattern)
df.all.DHS.unique$pattern <- factor(
  df.all.DHS.unique$pattern,
  levels = c(
    "GAT-AAA", "GAT-TAA", "GAT-ATA", "GAT-TTA", "GAT-AAT", "GAT-TAT", "GAT-GAT",
    "GAT-ATT", "GAT-TTT", "GAT-ATC", "GAT-AGA", "GAT-TCT", "GAT-TAG", "GAT-CTA"
  )
)

# Stack the motif_1 peak rows on top of the DHS control rows; status decides
# which group (and colour) a row belongs to.
df.plot <- rbind(df.all.unique, df.all.DHS.unique)
df.plot$status <- factor(df.plot$status, levels = c("motif_1", "indep.DHS.control"))
library(lattice)
library(latticeExtra)
pdf('xy_closest_2nd_XXX_to_closest_1st_GAT_to_GATA3_pos_motif1_compare_to_DHS.pdf', width = 10, height = 10)
print(
  xyplot(
    actual_freq ~ abs.dis | pattern,
    data = df.plot,
    groups = status,
    layout = c(4, 4),
    main = "GATA3 peak with motif1 (red) vs. DHS regions (black)",
    xlab = "relative distance between two triplets (bp)",
    ylab = "Frequency of Enrichment",
    xlim = c(0, 50),
    ylim = c(0, 0.3),
    aspect = 1,
    between = list(x = 1.0, y = 1.0),
    #type = c('p', 'smooth'),
    auto.key = list(space = "right", lines = TRUE, points = FALSE, cex = 1),
    scales = list(x = list(relation = "free", rot = 45, at = seq(from = 0, to = 50, by = 5))),
    par.settings = list(
      superpose.line = list(col = c("red", "black"), lwd = 2),
      strip.background = list(col = "grey85")
    ),
    panel = function(x, y, ...) {
      # Grey guide lines at 1..15 bp, then the grouped points.
      panel.abline(v = seq_len(15), col = "grey90")
      panel.xyplot(x, y, col = c("red", "black"), pch = 18, cex = 0.6, ...)
    }
  )
)
dev.off()

6.1.2 motif2456

#' Relative frequency of each distinct value in a vector
#'
#' @param data An atomic vector (here: absolute distances).
#' @return A data.frame with one row per distinct value: `value` (the value as
#'   a character string, taken from the names of `table(data)`) and
#'   `actual_freq` (its count divided by `length(data)`). Empty input yields a
#'   zero-row data.frame that still has both columns, so it can be used as the
#'   right-hand side of `merge(..., by.y = "value")`.
#' @details `table()` does not tabulate `NA` by default while `length(data)`
#'   counts it, so frequencies sum to less than 1 when `data` contains `NA`.
calculate_actual_frequency <- function(data) {
  counts <- table(data)
  data.frame(
    # names() is NULL for an empty table; as.character() keeps the column.
    value = as.character(names(counts)),
    actual_freq = as.vector(counts) / length(data),
    stringsAsFactors = FALSE
  )
}


my_motifs <- c("motif_4", "motif_2", "motif_6", "motif_5")
Anchor_triplets <- c("GAT")
Query_triplets <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")

# Build one long table over every motif / anchor / query combination.
# Column 11 of each BED file is read as the signed distance; each row also
# gets the relative frequency of its absolute distance within its own file.
df.all <- data.frame(matrix(nrow = 0, ncol = 6))
for (motif in my_motifs) {
  print(motif)

  for (Anchor_triplet in Anchor_triplets) {
    print(Anchor_triplet)
    df.peak.dis.all <- data.frame(matrix(nrow = 0, ncol = 6))
    for (Query_triplet in Query_triplets) {
      print(Query_triplet)
      bed_glob <- paste0("./closest.2nd.plus.", Query_triplet, ".to.1st.plus.", Anchor_triplet, ".GATA3.with.", motif, ".bed")
      for (closest_2nd_dis in Sys.glob(bed_glob)) {
        print(closest_2nd_dis)
        temp <- data.frame(
          dis = as.integer(read.table(closest_2nd_dis, header = FALSE, comment.char = "")[, 11]),
          anchor_3mer = Anchor_triplet,
          query_3mer = Query_triplet,
          status = motif,
          stringsAsFactors = FALSE
        )
        temp$abs.dis <- abs(temp$dis)
        actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
        temp1 <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
        df.peak.dis.all <- rbind(df.peak.dis.all, temp1)
      }
    }
    # Append once per anchor. Doing this after the anchor loop (as before)
    # would keep only the last anchor's table when several anchors are listed.
    df.all <- rbind(df.all, df.peak.dis.all)
  }
}


# Quick inspection of the combined motif table.
str(df.all)
unique(df.all$anchor_3mer)
unique(df.all$query_3mer)
unique(df.all$status)
nrow(df.all)
head(df.all)

# Keep one copy of each distinct row and label it "<anchor>-<query>".
df.all.unique <- unique(df.all)
df.all.unique$pattern <- paste(df.all.unique$anchor_3mer, df.all.unique$query_3mer, sep = "-")

unique(df.all.unique$pattern)
df.all.unique$pattern <- factor(
  df.all.unique$pattern,
  levels = c(
    "GAT-AAA", "GAT-TAA", "GAT-ATA", "GAT-TTA", "GAT-AAT", "GAT-TAT", "GAT-GAT",
    "GAT-ATT", "GAT-TTT", "GAT-ATC", "GAT-AGA", "GAT-TCT", "GAT-TAG", "GAT-CTA"
  )
)

unique(df.all.unique$status)
df.all.unique$status <- factor(df.all.unique$status, levels = c("motif_4", "motif_2", "motif_6", "motif_5"))
my_motifs <- c("motif_4", "motif_2", "motif_6", "motif_5")

library(lattice)
library(latticeExtra)
# One PDF per motif: that motif's rows against the DHS control rows held in
# df.all.DHS.unique, one panel per pattern.
for (motif in my_motifs) {
  print(motif)
  df.plot <- rbind(df.all.unique[df.all.unique$status == motif, ], df.all.DHS.unique)
  df.plot$status <- factor(df.plot$status, levels = c(motif, "indep.DHS.control"))
  pdf(paste0('xy_closest_2nd_XXX_to_closest_1st_GAT_to_GATA3_pos_', motif, '_compare_to_DHS.pdf'), width = 10, height = 10)
  print(
    xyplot(
      actual_freq ~ abs.dis | pattern,
      data = df.plot,
      groups = status,
      layout = c(4, 4),
      main = paste0("GATA3 peak with ", motif, " (red) vs. DHS regions (black)"),
      xlab = "relative distance between two triplets (bp)",
      ylab = "Frequency of Enrichment",
      xlim = c(0, 50),
      ylim = c(0, 0.3),
      aspect = 1,
      between = list(x = 1.0, y = 1.0),
      #type = c('p', 'smooth'),
      auto.key = list(space = "right", lines = TRUE, points = FALSE, cex = 1),
      scales = list(x = list(relation = "free", rot = 45, at = seq(from = 0, to = 50, by = 5))),
      par.settings = list(
        superpose.line = list(col = c("red", "black"), lwd = 2),
        strip.background = list(col = "grey85")
      ),
      panel = function(x, y, ...) {
        # Grey guide lines at 1..15 bp, then the grouped points.
        panel.abline(v = seq_len(15), col = "grey90")
        panel.xyplot(x, y, col = c("red", "black"), pch = 18, cex = 0.6, ...)
      }
    )
  )
  dev.off()
}

6.1.3 motif 5 peak set anchor at ATC

Prepare data:

#' Relative frequency of each distinct value in a vector
#'
#' @param data An atomic vector (here: absolute distances).
#' @return A data.frame with one row per distinct value: `value` (the value as
#'   a character string, taken from the names of `table(data)`) and
#'   `actual_freq` (its count divided by `length(data)`). Empty input yields a
#'   zero-row data.frame that still has both columns, so it can be used as the
#'   right-hand side of `merge(..., by.y = "value")`.
#' @details `table()` does not tabulate `NA` by default while `length(data)`
#'   counts it, so frequencies sum to less than 1 when `data` contains `NA`.
calculate_actual_frequency <- function(data) {
  counts <- table(data)
  data.frame(
    # names() is NULL for an empty table; as.character() keeps the column.
    value = as.character(names(counts)),
    actual_freq = as.vector(counts) / length(data),
    stringsAsFactors = FALSE
  )
}


#my_motifs = c("motif_1", "motif_4","motif_2", "motif_6", "motif_5")
my_motifs <- c("motif_5")
Anchor_triplets <- c("ATC")
Query_triplets <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")

# The accumulator is created once, before the motif loop. Creating it inside
# the loop (as before) discards every motif except the last one as soon as
# my_motifs lists more than one motif.
df.all <- data.frame(matrix(nrow = 0, ncol = 6))
for (motif in my_motifs) {
  print(motif)
  for (Anchor_triplet in Anchor_triplets) {
    print(Anchor_triplet)
    for (Query_triplet in Query_triplets) {
      print(Query_triplet)
      bed_glob <- paste0("./closest.2nd.plus.", Query_triplet, ".to.1st.plus.", Anchor_triplet, ".GATA3.with.", motif, ".bed")
      for (closest_2nd_dis in Sys.glob(bed_glob)) {
        print(closest_2nd_dis)
        # Column 11 of the BED file is read as the signed distance.
        temp <- data.frame(
          dis = as.integer(read.table(closest_2nd_dis, header = FALSE, comment.char = "")[, 11]),
          anchor_3mer = Anchor_triplet,
          query_3mer = Query_triplet,
          status = motif,
          stringsAsFactors = FALSE
        )
        temp$abs.dis <- abs(temp$dis)
        actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
        temp1 <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
        df.all <- rbind(df.all, temp1)
      }
    }
  }
}

# Quick inspection of the ATC-anchored peak table.
str(df.all)
unique(df.all$anchor_3mer)
unique(df.all$query_3mer)
nrow(df.all)
head(df.all)

# Keep one copy of each distinct row and label it "<anchor>-<query>".
df.all.unique <- unique(df.all)
df.all.unique$pattern <- paste(df.all.unique$anchor_3mer, df.all.unique$query_3mer, sep = "-")

unique(df.all.unique$pattern)
# No explicit level order here: as.factor() sorts the labels alphabetically.
df.all.unique$pattern <- as.factor(df.all.unique$pattern)

DHS:

Anchor_triplets <- c("ATC")
Query_triplets <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")

# Build the ATC-anchored DHS control table from the BED files in ../DHS/:
# column 11 is read as the signed distance and every row gets the relative
# frequency of its absolute distance within its own file.
df.all.DHS <- data.frame(matrix(nrow = 0, ncol = 6))
for (Anchor_triplet in Anchor_triplets) {
  print(Anchor_triplet)
  for (Query_triplet in Query_triplets) {
    print(Query_triplet)
    bed_glob <- paste0("../DHS/closest.2nd.plus.", Query_triplet, ".to.1st.plus.", Anchor_triplet, ".indep.DHS.control.bed")
    for (closest_2nd_dis in Sys.glob(bed_glob)) {
      print(closest_2nd_dis)
      signed_dis <- read.table(closest_2nd_dis, header = FALSE, comment.char = "")[, 11]
      dis_tbl <- data.frame(
        dis = as.integer(signed_dis),
        anchor_3mer = Anchor_triplet,
        query_3mer = Query_triplet,
        status = "indep.DHS.control",
        stringsAsFactors = FALSE
      )
      dis_tbl$abs.dis <- abs(dis_tbl$dis)
      freq_tbl <- calculate_actual_frequency(dis_tbl$abs.dis)
      df.all.DHS <- rbind(
        df.all.DHS,
        merge(dis_tbl, freq_tbl, by.x = "abs.dis", by.y = "value", all.x = TRUE)
      )
    }
  }
}


# Quick inspection of the ATC-anchored DHS control table.
str(df.all.DHS)
unique(df.all.DHS$anchor_3mer)
unique(df.all.DHS$query_3mer)
nrow(df.all.DHS)
head(df.all.DHS)

# Keep one copy of each distinct row and label it "<anchor>-<query>".
df.all.DHS.unique <- unique(df.all.DHS)
df.all.DHS.unique$pattern <- paste(
  df.all.DHS.unique$anchor_3mer,
  df.all.DHS.unique$query_3mer,
  sep = "-"
)

unique(df.all.DHS.unique$pattern)
df.all.DHS.unique$pattern <- as.factor(df.all.DHS.unique$pattern)

# `motif` is whatever value the loader loop above left behind.
df.plot <- rbind(df.all.unique, df.all.DHS.unique)
df.plot$status <- factor(df.plot$status, levels = c(motif, "indep.DHS.control"))
library(lattice)
library(latticeExtra)
pdf('xy_closest_2nd_XXX_to_closest_1st_ATC_to_GATA3_pos_motif_5_compare_to_DHS.pdf', width = 10, height = 10)
print(
  xyplot(
    actual_freq ~ abs.dis | pattern,
    data = df.plot,
    groups = status,
    layout = c(4, 4),
    main = paste0("GATA3 peak with ", motif, " (red) vs. DHS regions (black)"),
    xlab = "relative distance between two triplets (bp)",
    ylab = "Frequency of Enrichment",
    xlim = c(0, 50),
    ylim = c(0, 0.3),
    aspect = 1,
    between = list(x = 1.0, y = 1.0),
    #type = c('p', 'smooth'),
    auto.key = list(space = "right", lines = TRUE, points = FALSE, cex = 1),
    scales = list(x = list(relation = "free", rot = 45, at = seq(from = 0, to = 50, by = 5))),
    par.settings = list(
      superpose.line = list(col = c("red", "black"), lwd = 2),
      strip.background = list(col = "grey85")
    ),
    panel = function(x, y, ...) {
      # Grey guide lines at 1..15 bp, then the grouped points.
      panel.abline(v = seq_len(15), col = "grey90")
      panel.xyplot(x, y, col = c("red", "black"), pch = 18, cex = 0.6, ...)
    }
  )
)
dev.off()

6.2 GATA3 peak without motifs

ls *.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
wc -l  *.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
##    37308 closest.2nd.plus.AAA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
##    37308 closest.2nd.plus.AAT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
##    37308 closest.2nd.plus.AGA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
##    37308 closest.2nd.plus.ATA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
##    37308 closest.2nd.plus.ATC.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
##    37308 closest.2nd.plus.ATT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
##    37308 closest.2nd.plus.CTA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
##    37308 closest.2nd.plus.GAT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
##    37308 closest.2nd.plus.TAA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
##    37308 closest.2nd.plus.TAG.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
##    37308 closest.2nd.plus.TAT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
##    37308 closest.2nd.plus.TCT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
##    37308 closest.2nd.plus.TTA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
##    37308 closest.2nd.plus.TTT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
##   522312 total
#' Relative frequency of each distinct value in a vector
#'
#' @param data An atomic vector (here: absolute distances).
#' @return A data.frame with one row per distinct value: `value` (the value as
#'   a character string, taken from the names of `table(data)`) and
#'   `actual_freq` (its count divided by `length(data)`). Empty input yields a
#'   zero-row data.frame that still has both columns, so it can be used as the
#'   right-hand side of `merge(..., by.y = "value")`.
#' @details `table()` does not tabulate `NA` by default while `length(data)`
#'   counts it, so frequencies sum to less than 1 when `data` contains `NA`.
calculate_actual_frequency <- function(data) {
  counts <- table(data)
  data.frame(
    # names() is NULL for an empty table; as.character() keeps the column.
    value = as.character(names(counts)),
    actual_freq = as.vector(counts) / length(data),
    stringsAsFactors = FALSE
  )
}

Anchor_triplets <- c("GAT")
#Anchor_triplets = c("ATC")
Query_triplets <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")

# Build the table for GATA3 peaks without motifs: column 11 of each BED file
# is read as the signed distance and every row gets the relative frequency of
# its absolute distance within its own file.
df.all <- data.frame(matrix(nrow = 0, ncol = 6))
for (Anchor_triplet in Anchor_triplets) {
  print(Anchor_triplet)
  for (Query_triplet in Query_triplets) {
    print(Query_triplet)
    bed_glob <- paste0("./closest.2nd.plus.", Query_triplet, ".to.1st.plus.", Anchor_triplet, ".to.GATA3_without_motifs_123456_78.bed")
    for (closest_2nd_dis in Sys.glob(bed_glob)) {
      print(closest_2nd_dis)
      signed_dis <- read.table(closest_2nd_dis, header = FALSE, comment.char = "")[, 11]
      dis_tbl <- data.frame(
        dis = as.integer(signed_dis),
        anchor_3mer = Anchor_triplet,
        query_3mer = Query_triplet,
        status = "peak_without_motif",
        stringsAsFactors = FALSE
      )
      dis_tbl$abs.dis <- abs(dis_tbl$dis)
      freq_tbl <- calculate_actual_frequency(dis_tbl$abs.dis)
      df.all <- rbind(
        df.all,
        merge(dis_tbl, freq_tbl, by.x = "abs.dis", by.y = "value", all.x = TRUE)
      )
    }
  }
}
   



# Quick inspection of the without-motif peak table.
str(df.all)
unique(df.all$anchor_3mer)
unique(df.all$query_3mer)
unique(df.all$status)
nrow(df.all)
head(df.all)

# Keep one copy of each distinct row and label it "<anchor>-<query>".
df.all.unique <- unique(df.all)
df.all.unique$pattern <- paste(df.all.unique$anchor_3mer, df.all.unique$query_3mer, sep = "-")

unique(df.all.unique$pattern)
##  [1] "GAT-AAA" "GAT-TAA" "GAT-ATA" "GAT-TTA" "GAT-AAT" "GAT-TAT" "GAT-GAT"
##  [8] "GAT-ATT" "GAT-TTT" "GAT-ATC" "GAT-AGA" "GAT-TCT" "GAT-TAG" "GAT-CTA"
df.all.unique$pattern <- factor(
  df.all.unique$pattern,
  levels = c(
    "GAT-AAA", "GAT-TAA", "GAT-ATA", "GAT-TTA", "GAT-AAT", "GAT-TAT", "GAT-GAT",
    "GAT-ATT", "GAT-TTT", "GAT-ATC", "GAT-AGA", "GAT-TCT", "GAT-TAG", "GAT-CTA"
  )
)

unique(df.all.unique$status)
## [1] "motif_1"
# NOTE(review): df.all.DHS.unique is not rebuilt in this section; the previous
# section built it with anchor ATC. Confirm it holds the GAT-anchored DHS
# control before this chunk runs.
df.plot <- rbind(df.all.unique, df.all.DHS.unique)
df.plot$status <- factor(df.plot$status, levels = c("peak_without_motif", "indep.DHS.control"))
library(lattice)
library(latticeExtra)
pdf('xy_closest_2nd_XXX_to_closest_1st_GAT_to_GATA3_without_motifs_compare_to_DHS.pdf', width = 10, height = 10)
print(
  xyplot(
    actual_freq ~ abs.dis | pattern,
    data = df.plot,
    groups = status,
    layout = c(4, 4),
    main = "GATA3 peak without motifs (red) vs. DHS regions (black)",
    xlab = "relative distance between two triplets (bp)",
    ylab = "Frequency of Enrichment",
    xlim = c(0, 50),
    ylim = c(0, 0.3),
    aspect = 1,
    between = list(x = 1.0, y = 1.0),
    #type = c('p', 'smooth'),
    auto.key = list(space = "right", lines = TRUE, points = FALSE, cex = 1),
    scales = list(x = list(relation = "free", rot = 45, at = seq(from = 0, to = 50, by = 5))),
    par.settings = list(
      superpose.line = list(col = c("red", "black"), lwd = 2),
      strip.background = list(col = "grey85")
    ),
    panel = function(x, y, ...) {
      # Grey guide lines at 1..15 bp, then the grouped points.
      panel.abline(v = seq_len(15), col = "grey90")
      panel.xyplot(x, y, col = c("red", "black"), pch = 18, cex = 0.6, ...)
    }
  )
)
dev.off()

6.3 Heatmap

A heatmap with the 14 3mers on both the x and y axes, color-coded by the density at different distances.
A heatmap whose y axis is all peaks ranked by intensity, plotting the coordinates of the 1st and 2nd 3mer for each peak.

“heatmap_test.txt” stores a subset of data with distance info from closest +GAT for peaks with motif1 summit, and distance info from 2nd to 1st closest GAT for peaks with motif1.

df <- read.table("heatmap_test.txt", header = TRUE)[, c(1, 3)]

#' Build a peak-by-distance indicator matrix for the heatmap
#'
#' @param df Two-column table of signed distances, one row per peak.
#' @param dis.bound Largest absolute distance to keep; rows in which either
#'   column lies outside [-dis.bound, dis.bound] are dropped.
#' @return A numeric matrix with one row per retained peak and
#'   `2 * dis.bound + 1` columns. Column `j` stands for distance
#'   `j - dis.bound - 1`, so distance 0 sits in column `dis.bound + 1`.
#'   The cell for the first distance is set to 1 and the cell for the second
#'   distance to 0.6 (the second wins when both fall in the same column).
#'   Rows are ordered by decreasing absolute value of the first distance.
create_matrix <- function(df, dis.bound) {
  out_of_bound <- rowSums(df > dis.bound | df < -dis.bound) > 0
  df.in.bound <- df[!out_of_bound, , drop = FALSE]
  df.in.bound <- df.in.bound[order(-abs(df.in.bound[, 1])), , drop = FALSE]

  heat <- matrix(0, nrow = nrow(df.in.bound), ncol = 2 * dis.bound + 1)
  # +1 because R indexes from 1: without it a distance of -dis.bound maps to
  # column 0, which R silently ignores on assignment.
  offset <- dis.bound + 1
  for (i in seq_len(nrow(df.in.bound))) {
    heat[i, df.in.bound[i, 1] + offset] <- 1
    heat[i, df.in.bound[i, 2] + offset] <- 0.6
  }
  heat
}

mat <- create_matrix(df, 100)
# Long format for levelplot(): one row per matrix cell.
mat_df <- as.data.frame(as.table(mat))
names(mat_df) <- c("Peak", "Distance", "Value")

# Plot
#pdf("test_heatmap_spaced_GAT_distance_to_peak_summit.pdf")
#print(
levelplot(Value ~ Distance * Peak, data = mat_df,
          col.regions = colorRampPalette(c( "white","blue", "red")),
          aspect = 2,
          at = seq(0, 1, length.out = 150),
          axes = FALSE,
          sub = "",
          colorkey = FALSE,
          region = TRUE,
          scales = list(draw = FALSE),
          xlab = "Distance to peak summit", ylab = "GATA3 Peaks with motif1", main = "closest GAT (red) and 2nd closest GAT (blue) Distances to peak summit",
          newpage = FALSE,
          panel = function(...) {
            panel.levelplot(...)
            # Mark the middle column, which is the distance-0 column.
            panel.abline(v = (ncol(mat) + 1) / 2, col = "black")
          }
         )

#)
#dev.off()

RSAT:
Confirm how the software counts observed spaced 3mers: is a spaced 3mer counted twice if it occurs at two different loci on one sequence? – Yes, see the coherence check section under RSAT.
Combine results regardless of upstream or downstream orientation.

6.4 GATA3 peaks without motifs

6.4.1 GAT-GAT/ATC

without_motifs_123456_78_161bp_mast.bed
37308 peaks.

step1: find the 3mer coordinates (on hg38 genome) that are closest to each peak summit with closestBed.
step2: remove the closest 3mer coordinates from the whole genome 3mer coordinates on the same strand with bedtools subtract.
step3: find the second closest 3mer relative to the closest one.

My list of 3mer: GAT, ATC.

6.4.1.1 STEP1:

240218_closestBed.R:

(cd /home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/GAT_ATC_240218)

#!/usr/bin/env Rscript

Args=commandArgs(TRUE)
# closestBed function
#' Run bedtools closestBed on two BED-like data frames
#'
#' Both tables are written to scratch files in the working directory, sorted by
#' chromosome and start with `sort -k1,1 -k2,2n`, passed to closestBed, and the
#' result is read back. Scratch files get unique names and are removed when the
#' function exits, even on error, so concurrent runs in one directory do not
#' overwrite each other's files.
#'
#' @param functionstring Path to the closestBed executable.
#' @param bed1 Data frame written as the `-a` file.
#' @param bed2 Data frame written as the `-b` file.
#' @param opt.string Extra closestBed options. The result is given one extra
#'   trailing column named `dis`, so the options are expected to add exactly
#'   one output column (e.g. `-d`).
#' @return Data frame with the columns of `bed1`, the columns of `bed2`, and
#'   `dis`. Stops with an error if any shell command exits non-zero.
bedTools.closest <- function(functionstring="/home/FCAM/ssun/packages/bedtools2/bin/closestBed",bed1,bed2,opt.string="") {

  # Not restored on exit: the setting stays active for the rest of the session,
  # as before, so later write.table() calls also avoid scientific notation.
  options(scipen = 99)

  scratch <- function(tag) {
    tempfile(pattern = paste0(tag, "."), tmpdir = getwd(), fileext = ".bed")
  }
  a.file <- scratch("a.file")
  b.file <- scratch("b.file")
  a.sorted <- scratch("a.file.sorted")
  b.sorted <- scratch("b.file.sorted")
  out.file <- scratch("out.file")
  on.exit(unlink(c(a.file, b.file, a.sorted, b.sorted, out.file)), add = TRUE)

  # Echo the command, run it, and fail loudly instead of carrying on with a
  # missing or truncated intermediate file.
  run <- function(command) {
    cat(command, "\n")
    status <- system(command)
    if (status != 0) {
      stop("command failed with exit status ", status, ": ", command, call. = FALSE)
    }
    invisible(status)
  }

  # write bed formatted data.frames to the scratch files
  write.table(bed1, file = a.file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  write.table(bed2, file = b.file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)

  # sort the a and b files by coordinates
  run(paste("sort -k1,1 -k2,2n", shQuote(a.file), ">", shQuote(a.sorted)))
  run(paste("sort -k1,1 -k2,2n", shQuote(b.file), ">", shQuote(b.sorted)))

  # call closestBed on the sorted files
  run(paste(functionstring, opt.string, "-a", shQuote(a.sorted), "-b", shQuote(b.sorted), ">", shQuote(out.file), sep = " "))

  res <- read.table(out.file, header = FALSE, comment.char = "")
  colnames(res) <- c(colnames(bed1), colnames(bed2), "dis")
  return(res)
}

dir1 <- "/labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/"
dir2 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/"
dir3 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/ENCODE_DHS_GSE29692/"

prioritized_triplets <- c("GAT", "ATC")
library(bigWig)

# Write coordinates as plain numbers (no scientific notation) in the BED output.
options(scipen = 99)

# Read the genome-wide coordinate file of one triplet on one strand. The glob
# must match exactly one file; anything else is reported explicitly.
read_triplet_bed <- function(strand, triplet) {
  hits <- Sys.glob(paste0(dir1, "hg38.3.3.3", strand, ".*_", triplet, ".bed"))
  if (length(hits) != 1L) {
    stop("expected exactly one ", strand, " file for ", triplet, ", found ", length(hits), call. = FALSE)
  }
  read.table(file = hits, sep = "\t", header = FALSE)
}

# Write a table as headerless, unquoted, tab-separated text.
write_bed <- function(x, file) {
  write.table(x, file = file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
}

# These two tables do not depend on the triplet, so they are loaded once
# instead of on every iteration.
# peak summits
GATA3_peak_summits <- center.bed(read.table(paste0(dir2, "without_motifs_123456_78_161bp_mast.bed"), header = FALSE), upstreamWindow = 0, downstreamWindow = 0)
# consensus neg
indep.DHS.control.consensus <- center.bed(read.table(paste0(dir3, "MCF7DHS_consensus_noGATA_without_motifs_123456_78.bed"), header = FALSE), upstreamWindow = 0, downstreamWindow = 0)

for (triplet in prioritized_triplets) {
  print(triplet)
  for (strand in c("plus", "minus")) {
    # 3mer genome coordinates for this strand
    triplet.file <- read_triplet_bed(strand, triplet)

    # closestBed -- 1st closest 3mer to each GATA3 peak summit
    closest.to.peak <- bedTools.closest(bed1 = GATA3_peak_summits[, 1:3], bed2 = triplet.file, opt.string = '-d -t first')
    write_bed(closest.to.peak, paste0('closest.1st.', strand, '.', triplet, '.to.GATA3_without_motifs_123456_78_161bp_mast.bed'))

    # closestBed -- 1st closest 3mer to each independent DHS control region
    closest.to.DHS <- bedTools.closest(bed1 = indep.DHS.control.consensus[, 1:3], bed2 = triplet.file, opt.string = '-d -t first')
    write_bed(closest.to.DHS, paste0('closest.1st.', strand, '.', triplet, '.to.indep.DHS.control.consensus.bed'))
  }
}

runR.sh

#!/bin/bash
#SBATCH --job-name=runR.sh     # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 8
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o runR.sh_%j.out
#SBATCH -e runR.sh_%j.err

# Batch wrapper for STEP1: load R 4.1.2 through the module system, then run
# the closestBed driver script. The script is referenced by a relative path,
# so the job has to be submitted from the directory that contains it.
module load R/4.1.2
Rscript 240218_closestBed.R

6.4.1.2 STEP2: remove the closest 3mer coordinates from the whole genome 3mer coordinates on the same strand with bedtools subtract.

#!/bin/bash
#SBATCH --job-name=remove_1st_3mer.sh     # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 16
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o remove_1st_3mer.sh_%j.out
#SBATCH -e remove_1st_3mer.sh_%j.err


input_dir1=/labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/
input_dir2=/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/GAT_ATC_240218/


prioritized_triplets=("GAT" "ATC")

# For every prioritized triplet and each strand: take the genome-wide triplet
# coordinates and remove the triplets that were reported as closest to a
# GATA3 peak summit (same strand, full overlap required).
module load bedtools 
for triplet in "${prioritized_triplets[@]}"
do
  echo "$triplet"

  for strand in plus minus
  do
    genome_sorted="hg38.3.3.3${strand}.${triplet}.sorted.bed"
    first_sorted="closest.1st.${strand}.${triplet}.to.GATA3_without_motifs_123456_78_161bp_mast.uniq.sorted.bed"

    # The glob stays outside the quotes so the shell still expands it.
    sort -k1,1 -k2,2n "${input_dir1}"hg38.3.3.3"${strand}"*"${triplet}".bed > "$genome_sorted"

    # Columns 4-10 of the closestBed output are the triplet record. Sort
    # before uniq: uniq only collapses adjacent duplicate lines.
    awk 'BEGIN { OFS = "\t" } { print $4, $5, $6, $7, $8, $9, $10 }' "${input_dir2}closest.1st.${strand}.${triplet}.to.GATA3_without_motifs_123456_78_161bp_mast.bed" | sort -k1,1 -k2,2n | uniq > "$first_sorted"

    bedtools subtract -a "$genome_sorted" -b "$first_sorted" -f 1.00 -s > "hg38.3.3.3${strand}.${triplet}_without_1st_${strand}_${triplet}_to_GATA3_without_motifs_123456_78_161bp_mast.bed"

    rm "$genome_sorted"
    rm "$first_sorted"
  done

done

# independent DHS control
for triplet in "${prioritized_triplets[@]}"
do
  echo $triplet
  # plus
  sort -k1,1 -k2,2n ${input_dir1}hg38.3.3.3plus*${triplet}.bed > hg38.3.3.3plus.${triplet}.sorted.bed
  awk '{print $4, $5, $6, $7, $8, $9, $10}' ${input_dir2}closest.1st.plus.${triplet}.to.indep.DHS.control.consensus.bed | awk '{$1=$1}1' OFS="\t" | uniq | sort -k1,1 -k2,2n > closest.1st.plus.${triplet}.to.indep.DHS.control.consensus.uniq.sorted.bed
  bedtools subtract -a hg38.3.3.3plus.${triplet}.sorted.bed -b closest.1st.plus.${triplet}.to.indep.DHS.control.consensus.uniq.sorted.bed -f 1.00 -s > hg38.3.3.3plus.36_${triplet}_without_1st_plus_${triplet}_to_indep_DHS_control.bed
   rm hg38.3.3.3plus.${triplet}.sorted.bed
   rm closest.1st.plus.${triplet}.to.indep.DHS.control.consensus.uniq.sorted.bed
   
  # minus
  sort -k1,1 -k2,2n ${input_dir1}hg38.3.3.3minus*${triplet}.bed > hg38.3.3.3minus.${triplet}.sorted.bed
  awk '{print $4, $5, $6, $7, $8, $9, $10}' ${input_dir2}closest.1st.minus.${triplet}.to.indep.DHS.control.consensus.bed | awk '{$1=$1}1' OFS="\t" | uniq | sort -k1,1 -k2,2n > closest.1st.minus.${triplet}.to.indep.DHS.control.consensus.uniq.sorted.bed
  bedtools subtract -a hg38.3.3.3minus.${triplet}.sorted.bed -b closest.1st.minus.${triplet}.to.indep.DHS.control.consensus.uniq.sorted.bed -f 1.00 -s > hg38.3.3.3minus.36_${triplet}_without_1st_minus_${triplet}_to_indep_DHS_control.bed
   rm hg38.3.3.3minus.${triplet}.sorted.bed
   rm closest.1st.minus.${triplet}.to.indep.DHS.control.consensus.uniq.sorted.bed
done

6.4.1.3 STEP3

240218_closestBed2.R

#!/usr/bin/env Rscript

Args=commandArgs(TRUE)

#' Run bedtools closestBed on two BED-like data frames.
#'
#' Both inputs are written to temporary tab-separated files (no header, no row
#' names), coordinate-sorted with `sort -k1,1 -k2,2n`, and handed to closestBed
#' as the `-a` and `-b` files. Each shell command is echoed before it runs.
#'
#' @param functionstring Command used to invoke closestBed.
#' @param bed1 Data frame written out as the `-a` file.
#' @param bed2 Data frame written out as the `-b` file.
#' @param opt.string Extra command-line options placed before `-a`.
#' @return Data frame read from the closestBed output. Columns are named after
#'   `bed1`, then `bed2`, then `dis` (the trailing column closestBed appends
#'   when `opt.string` contains `-d`).
#' @details Stops with an error if any shell command exits with a non-zero
#'   status. Temporary files are removed and `options(scipen)` is restored
#'   even when an error occurs.
bedTools.closest <- function(functionstring="/home/FCAM/ssun/packages/bedtools2/bin/closestBed",bed1,bed2,opt.string="") {

  # Do not use scientific notation when writing coordinates; restore on exit
  old_opts <- options(scipen = 99)
  on.exit(options(old_opts), add = TRUE)

  # Unique temp files so concurrent or repeated calls cannot clobber each other
  a_file <- tempfile("a.file.", fileext = ".bed")
  b_file <- tempfile("b.file.", fileext = ".bed")
  a_sorted <- tempfile("a.file.sorted.", fileext = ".bed")
  b_sorted <- tempfile("b.file.sorted.", fileext = ".bed")
  out_file <- tempfile("out.file.", fileext = ".bed")
  on.exit(unlink(c(a_file, b_file, a_sorted, b_sorted, out_file)), add = TRUE)

  # Echo a shell command, run it, and fail loudly on a non-zero exit status
  run_cmd <- function(cmd) {
    cat(cmd, "\n")
    status <- system(cmd)
    if (status != 0) {
      stop("command failed with exit status ", status, ": ", cmd, call. = FALSE)
    }
    invisible(status)
  }

  # Write BED-formatted data frames to the temp files
  write.table(bed1, file = a_file, quote = FALSE, sep = "\t",
              col.names = FALSE, row.names = FALSE)
  write.table(bed2, file = b_file, quote = FALSE, sep = "\t",
              col.names = FALSE, row.names = FALSE)

  # Sort both files by chromosome, then numerically by start coordinate
  run_cmd(paste("sort -k1,1 -k2,2n", shQuote(a_file), ">", shQuote(a_sorted)))
  run_cmd(paste("sort -k1,1 -k2,2n", shQuote(b_file), ">", shQuote(b_sorted)))

  # Call closestBed on the sorted files
  run_cmd(paste(functionstring, opt.string,
                "-a", shQuote(a_sorted),
                "-b", shQuote(b_sorted),
                ">", shQuote(out_file), sep = " "))

  res <- read.table(out_file, header = FALSE, comment.char = "")
  colnames(res) <- c(colnames(bed1), colnames(bed2), "dis")
  res
}


#library(lattice)
#library(latticeExtra)
#library(Biostrings)
library(bigWig)

# Triplets of interest; every ordered pair (self-pairs included) forms a 6mer
prioritized_triplets <- c("GAT", "ATC")

# Enumerate the ordered pairs with the first triplet varying slowest
n_triplets <- length(prioritized_triplets)
leading_triplet <- rep(prioritized_triplets, each = n_triplets)
trailing_triplet <- rep(prioritized_triplets, times = n_triplets)
all_6mers <- as.list(paste0(leading_triplet, trailing_triplet))

# Table of 6mer halves: bases 1-3 and bases 4-6
first_3_bases <- substr(all_6mers, 1, 3)
last_3_bases <- substr(all_6mers, 4, 6)
df <- data.frame(First_3_bases = first_3_bases, Last_3_bases = last_3_bases)


# Inputs shared by the 2nd-closest 3mer searches

# Directory the loops read the closest.1st.* and hg38.*_without_1st_* files from
dir <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/GAT_ATC_240218/"

# Per-pattern window table; the columns pattern, plus_upstream and
# plus_downstream are looked up below
win <- read.csv('/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/closest_2nd_other_3mer/pattern_anchor_at_GorC_for_bigWig_pkg.csv')

# GATA3 peaks without motifs (the *_without_motifs_123456_78_161bp_mast files):
# one pass per (pattern1, pattern2) row of df
for (i in seq_len(nrow(df))) {
  pattern1 <- df[i, 1]
  pattern2 <- df[i, 2]

  # anchor position: closest plus pattern1 relative to G/C
  # NOTE(review): columns 4:11 are presumably the matched 3mer record plus the
  # distance column from the 1st closestBed run -- confirm against STEP1.
  print(pattern1)
  anchor_closest <- read.table(
    paste0(dir, "closest.1st.plus.", pattern1, ".to.GATA3_without_motifs_123456_78_161bp_mast.bed"),
    header = FALSE
  )
  closest_plus_3mer_to_GATA3_peak_summits <- fiveprime.bed(
    anchor_closest[, 4:11],
    upstreamWindow = win[win$pattern == pattern1, "plus_upstream"],
    downstreamWindow = win[win$pattern == pattern1, "plus_downstream"]
  )

  # query 3mer coordinates on genome (without the overlapped closest 3mer
  # coordinates) - relative to G/C
  print(pattern2)
  query_genome <- read.table(
    file = paste0(dir, "hg38.3.3.3plus.", pattern2, "_without_1st_plus_", pattern2, "_to_GATA3_without_motifs_123456_78_161bp_mast.bed"),
    sep = "\t", header = FALSE
  )
  plus.3mer.file <- fiveprime.bed(
    query_genome,
    upstreamWindow = win[win$pattern == pattern2, "plus_upstream"],
    downstreamWindow = win[win$pattern == pattern2, "plus_downstream"]
  )

  # 2nd closest plus 3mer to closest plus 3mer
  closest.2nd.plus.3mer.to.1st.plus.3mer.GATA3.without.motif <- bedTools.closest(
    bed1 = closest_plus_3mer_to_GATA3_peak_summits[, 1:3],
    bed2 = plus.3mer.file,
    opt.string = '-d -t first'
  )

  write.table(
    closest.2nd.plus.3mer.to.1st.plus.3mer.GATA3.without.motif,
    file = paste0("closest.2nd.plus.", pattern2, ".to.1st.plus.", pattern1, ".GATA3_without_motifs_123456_78_161bp_mast.bed"),
    quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE
  )
}


# DHS regions (independent DHS control): one pass per (pattern1, pattern2)
# row of df, mirroring the search done for the GATA3 peaks
for (i in seq_len(nrow(df))) {
  pattern1 <- df[i, 1]
  pattern2 <- df[i, 2]

  # anchor position: closest plus pattern1 relative to G/C
  # NOTE(review): columns 4:11 are presumably the matched 3mer record plus the
  # distance column from the 1st closestBed run -- confirm against STEP1.
  print(pattern1)
  anchor_closest <- read.table(
    paste0(dir, "closest.1st.plus.", pattern1, ".to.indep.DHS.control.consensus.bed"),
    header = FALSE
  )
  closest_plus_3mer_to_DHS <- fiveprime.bed(
    anchor_closest[, 4:11],
    upstreamWindow = win[win$pattern == pattern1, "plus_upstream"],
    downstreamWindow = win[win$pattern == pattern1, "plus_downstream"]
  )

  # query 3mer coordinates on genome (without the overlapped closest 3mer
  # coordinates) - relative to G/C
  print(pattern2)
  query_genome <- read.table(
    file = paste0(dir, "hg38.3.3.3plus.36_", pattern2, "_without_1st_plus_", pattern2, "_to_indep_DHS_control.bed"),
    sep = "\t", header = FALSE
  )
  plus.3mer.file <- fiveprime.bed(
    query_genome,
    upstreamWindow = win[win$pattern == pattern2, "plus_upstream"],
    downstreamWindow = win[win$pattern == pattern2, "plus_downstream"]
  )

  # 2nd closest plus 3mer to closest plus 3mer
  closest.2nd.plus.3mer.to.1st.plus.3mer.DHS <- bedTools.closest(
    bed1 = closest_plus_3mer_to_DHS[, 1:3],
    bed2 = plus.3mer.file,
    opt.string = '-d -t first'
  )

  write.table(
    closest.2nd.plus.3mer.to.1st.plus.3mer.DHS,
    file = paste0("closest.2nd.plus.", pattern2, ".to.1st.plus.", pattern1, ".indep.DHS.control.bed"),
    quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE
  )
}

runR.sh

#!/bin/bash
# SLURM batch wrapper: loads R 4.1.2 and runs 240218_closestBed2.R.
# Resources requested: 1 node, 1 task, 16 CPUs per task, 200G memory.
#SBATCH --job-name=runR.sh     # name for job
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --partition=general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH --output=runR.sh_%j.out
#SBATCH --error=runR.sh_%j.err

module load R/4.1.2
Rscript 240218_closestBed2.R

coherence check

wc -l closest.2nd.plus.ATC.to.1st.plus.ATC.GATA3_without_motifs_123456_78_161bp_mast.bed
#37308 closest.2nd.plus.ATC.to.1st.plus.ATC.GATA3_without_motifs_123456_78_161bp_mast.bed

wc -l ../without_motifs_123456_78_161bp_mast.bed
#37308 ../without_motifs_123456_78_161bp_mast.bed

I also randomly selected a few coordinates and verified the bases in the UCSC Genome Browser.

6.4.1.4 Plots

ls closest.2nd*.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.ATC.to.1st.plus.ATC.GATA3.without.motifs.quantile1.bed
## closest.2nd.plus.ATC.to.1st.plus.ATC.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.2nd.plus.ATC.to.1st.plus.ATC.indep.DHS.control.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3.without.motifs.quantile1.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.GAT.to.1st.plus.ATC.GATA3.without.motifs.quantile1.bed
## closest.2nd.plus.GAT.to.1st.plus.ATC.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.2nd.plus.GAT.to.1st.plus.ATC.indep.DHS.control.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3.without.motifs.quantile1.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed

density plot
xy plot

GATA3 peaks without motifs:

# Tabulate how often each distinct value occurs in `data` and express every
# count as a fraction of length(data).
# Returns a data frame with `value` (the distinct values, as the names of the
# frequency table) and `actual_freq` (count / length(data)).
calculate_actual_frequency <- function(data) {
  value_counts <- table(data)
  data.frame(
    value = names(value_counts),
    actual_freq = as.vector(value_counts) / length(data)
  )
}


# Build df.peak.nomotif: one row per closestBed record from every
# closest.2nd.plus.<query>.to.1st.plus.<anchor> file for the GATA3 peaks
# without motifs, annotated with the anchor/query triplet and with the
# frequency of its absolute distance within that file.

# Capture groups: \\1 = query 3mer, \\2 = anchor 3mer (dots escaped so they
# match literally)
peak_name_regex <- "^closest\\.2nd\\.plus\\.(.+)\\.to\\.1st\\.plus\\.(.+)\\.GATA3_without_motifs_123456_78_161bp_mast\\.bed$"
peak_files <- Sys.glob("./closest.2nd.plus*.to.1st.plus.*.GATA3_without_motifs_123456_78_161bp_mast.bed")

peak_tables <- lapply(peak_files, function(closest_2nd_dis) {
  print(closest_2nd_dis)
  file_name <- basename(closest_2nd_dis)
  anchor_3mer <- sub(peak_name_regex, "\\2", file_name)
  print(anchor_3mer)
  query_3mer <- sub(peak_name_regex, "\\1", file_name)
  print(query_3mer)

  # Column 11 is the last column written by the 2nd closestBed run
  dis <- read.table(closest_2nd_dis, header = FALSE, comment.char = "")[, 11]
  temp <- data.frame(
    dis = as.integer(dis),
    anchor_3mer = anchor_3mer,
    query_3mer = query_3mer,
    stringsAsFactors = FALSE
  )
  temp$abs.dis <- abs(temp$dis)

  # Attach the per-file frequency of each absolute distance
  actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
  merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
})

# Empty five-column frame when no file matched; otherwise bind all per-file
# tables in one step instead of growing the frame inside the loop
df.peak.nomotif <- data.frame(matrix(nrow = 0, ncol = 5))
colnames(df.peak.nomotif) <- c("dis", "anchor_3mer", "query_3mer", "abs.dis", "actual_freq")
if (length(peak_tables) > 0) {
  df.peak.nomotif <- do.call(rbind, peak_tables)
}
## [1] "./closest.2nd.plus.ATC.to.1st.plus.ATC.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "ATC"
## [1] "ATC"
## [1] "./closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "GAT"
## [1] "ATC"
## [1] "./closest.2nd.plus.GAT.to.1st.plus.ATC.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "ATC"
## [1] "GAT"
## [1] "./closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "GAT"
## [1] "GAT"
str(df.peak.nomotif)
## 'data.frame':    149232 obs. of  5 variables:
##  $ abs.dis    : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ dis        : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ anchor_3mer: chr  "ATC" "ATC" "ATC" "ATC" ...
##  $ query_3mer : chr  "ATC" "ATC" "ATC" "ATC" ...
##  $ actual_freq: num  0.0143 0.0143 0.0143 0.0143 0.0143 ...
unique(df.peak.nomotif$anchor_3mer)
## [1] "ATC" "GAT"
unique(df.peak.nomotif$query_3mer)
## [1] "ATC" "GAT"
nrow(df.peak.nomotif)
## [1] 149232
head(df.peak.nomotif)
##   abs.dis dis anchor_3mer query_3mer actual_freq
## 1       3   3         ATC        ATC  0.01434009
## 2       3   3         ATC        ATC  0.01434009
## 3       3   3         ATC        ATC  0.01434009
## 4       3   3         ATC        ATC  0.01434009
## 5       3   3         ATC        ATC  0.01434009
## 6       3   3         ATC        ATC  0.01434009
# Tag every GATA3-peak row with its region set
df.peak.nomotif[["status"]] <- "GATA3_peak_without_motifs"

DHS regions (neg ctrl):

# Build df.ctrl: one row per closestBed record from every
# closest.2nd.plus.<query>.to.1st.plus.<anchor>.indep.DHS.control.bed file,
# annotated with the anchor/query triplet and with the frequency of its
# absolute distance within that file.
# NOTE(review): the glob matches every such file in the working directory,
# including query triplets other than GAT/ATC if files from other runs are
# present -- confirm that is intended.

# Capture groups: \\1 = query 3mer, \\2 = anchor 3mer (dots escaped so they
# match literally)
ctrl_name_regex <- "^closest\\.2nd\\.plus\\.(.+)\\.to\\.1st\\.plus\\.(.+)\\.indep\\.DHS\\.control\\.bed$"
ctrl_files <- Sys.glob("./closest.2nd.plus*.to.1st.plus.*.indep.DHS.control.bed")

ctrl_tables <- lapply(ctrl_files, function(closest_2nd_dis) {
  print(closest_2nd_dis)
  file_name <- basename(closest_2nd_dis)
  anchor_3mer <- sub(ctrl_name_regex, "\\2", file_name)
  print(anchor_3mer)
  query_3mer <- sub(ctrl_name_regex, "\\1", file_name)
  print(query_3mer)

  # Column 11 is the last column written by the 2nd closestBed run
  dis <- read.table(closest_2nd_dis, header = FALSE, comment.char = "")[, 11]
  temp <- data.frame(
    dis = as.integer(dis),
    anchor_3mer = anchor_3mer,
    query_3mer = query_3mer,
    stringsAsFactors = FALSE
  )
  temp$abs.dis <- abs(temp$dis)

  # Attach the per-file frequency of each absolute distance
  actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
  merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
})

# Empty five-column frame when no file matched; otherwise bind all per-file
# tables in one step instead of growing the frame inside the loop
df.ctrl <- data.frame(matrix(nrow = 0, ncol = 5))
colnames(df.ctrl) <- c("dis", "anchor_3mer", "query_3mer", "abs.dis", "actual_freq")
if (length(ctrl_tables) > 0) {
  df.ctrl <- do.call(rbind, ctrl_tables)
}
## [1] "./closest.2nd.plus.AAA.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "AAA"
## [1] "./closest.2nd.plus.AAT.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "AAT"
## [1] "./closest.2nd.plus.AGA.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "AGA"
## [1] "./closest.2nd.plus.ATA.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "ATA"
## [1] "./closest.2nd.plus.ATC.to.1st.plus.ATC.indep.DHS.control.bed"
## [1] "ATC"
## [1] "ATC"
## [1] "./closest.2nd.plus.ATC.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "ATC"
## [1] "./closest.2nd.plus.ATT.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "ATT"
## [1] "./closest.2nd.plus.CTA.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "CTA"
## [1] "./closest.2nd.plus.GAT.to.1st.plus.ATC.indep.DHS.control.bed"
## [1] "ATC"
## [1] "GAT"
## [1] "./closest.2nd.plus.GAT.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "GAT"
## [1] "./closest.2nd.plus.TAA.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "TAA"
## [1] "./closest.2nd.plus.TAG.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "TAG"
## [1] "./closest.2nd.plus.TAT.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "TAT"
## [1] "./closest.2nd.plus.TCT.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "TCT"
## [1] "./closest.2nd.plus.TTA.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "TTA"
## [1] "./closest.2nd.plus.TTT.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "TTT"
str(df.ctrl)
## 'data.frame':    926496 obs. of  5 variables:
##  $ abs.dis    : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ dis        : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ anchor_3mer: chr  "GAT" "GAT" "GAT" "GAT" ...
##  $ query_3mer : chr  "AAA" "AAA" "AAA" "AAA" ...
##  $ actual_freq: num  0.0213 0.0213 0.0213 0.0213 0.0213 ...
unique(df.ctrl$anchor_3mer)
## [1] "GAT" "ATC"
unique(df.ctrl$query_3mer)
##  [1] "AAA" "AAT" "AGA" "ATA" "ATC" "ATT" "CTA" "GAT" "TAA" "TAG" "TAT" "TCT"
## [13] "TTA" "TTT"
nrow(df.ctrl)
## [1] 926496
head(df.ctrl)
##   abs.dis dis anchor_3mer query_3mer actual_freq
## 1       3   3         GAT        AAA  0.02125859
## 2       3   3         GAT        AAA  0.02125859
## 3       3   3         GAT        AAA  0.02125859
## 4       3   3         GAT        AAA  0.02125859
## 5       3   3         GAT        AAA  0.02125859
## 6       3   3         GAT        AAA  0.02125859
# Tag the control rows, then stack both region sets into one plotting table
df.ctrl[["status"]] <- "MCF7_DHS_regions"
df.plot <- rbind(df.peak.nomotif, df.ctrl)

# Panel order (anchor-query) and group order used by the plots
pattern_levels <- c("ATC-ATC", "GAT-ATC", "ATC-GAT", "GAT-GAT")
status_levels <- c("GATA3_peak_without_motifs", "MCF7_DHS_regions")

df.plot$pattern <- factor(
  paste0(df.plot$anchor_3mer, "-", df.plot$query_3mer),
  levels = pattern_levels
)
df.plot$status <- factor(df.plot$status, levels = status_levels)

nrow(df.plot)
## [1] 1075728
nrow(df.plot[!duplicated(df.plot), ])
## [1] 14862
summary(df.plot[!duplicated(df.plot), ])
##     abs.dis              dis           anchor_3mer         query_3mer       
##  Min.   :     1.0   Min.   :     1.0   Length:14862       Length:14862      
##  1st Qu.:   187.2   1st Qu.:   187.2   Class :character   Class :character  
##  Median :   375.0   Median :   375.0   Mode  :character   Mode  :character  
##  Mean   :   505.1   Mean   :   505.1                                        
##  3rd Qu.:   596.0   3rd Qu.:   596.0                                        
##  Max.   :128938.0   Max.   :128938.0                                        
##   actual_freq                              status         pattern    
##  Min.   :1.727e-05   GATA3_peak_without_motifs: 2610   ATC-ATC:1216  
##  1st Qu.:3.454e-05   MCF7_DHS_regions         :12252   GAT-ATC:1317  
##  Median :1.727e-04                                     ATC-GAT:1280  
##  Mean   :1.346e-03                                     GAT-GAT:1176  
##  3rd Qu.:1.179e-03                                     NA's   :9873  
##  Max.   :2.016e-01
library(lattice)
library(latticeExtra)
# Plot actual_freq against abs.dis, one panel per anchor-query pattern, with
# the two region sets (status) overlaid, and save the figure to a PDF.
pdf('xy_closest_2nd_3mer_to_closest_1st_3mer_to_GATA3_peaks_without_motifs.pdf', width=15,height=5)
print(
  xyplot(actual_freq ~ abs.dis | pattern, 
         #data = df.plot[!duplicated(df.plot), ],
         # The full table (duplicated rows included) is plotted; the
         # de-duplicated alternative is left commented out above.
         data = df.plot, 
         groups = status,
         auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
         aspect = 1,
         # Axes are restricted to distances 0-50 and frequencies 0-0.15
         xlim=c(0,50),
         ylim=c(0, 0.15),
         # Four panels in a single row
         layout=c(4,1),
         #type = c('p', 'smooth'),
         xlab = "relative distance (bp) between zinc finger (anchor at G/C)",
         ylab="Frequency",
         #main="Independent DHS Regions",
         between=list(y=1.0),
         scales = list(x = list(relation="free", rot = 45, at = seq(from = 0, to = 50, by = 5))),
         par.settings = list(superpose.line = list(col=c("red", "black"), lwd=2), strip.background=list(col="grey85")),
         # Custom panel, drawn in this order: light-grey vertical guides at
         # x = 1..10, a density curve of x (pink/grey), then the points
         # (red/black).
         # NOTE(review): `data` and `type = "count"` are passed on to
         # panel.densityplot; it is unclear whether it uses them -- confirm.
         panel = function(x,y,...) {panel.abline(v=c(seq(1, 10, 1)), col = "grey90")
                                    panel.densityplot(x, data = df.plot, 
                                                        from=0, 
                                                        to=50, 
                                                        lty = c(1),
                                                        lwd=2, 
                                                        darg=list(bw = "nrd0", kernel="gaussian"),
                                                        type = "count",
                                                        col=c("pink","grey"), ...)
                                    panel.xyplot(x, y, 
                                                   col=c("red","black"), 
                                                   pch=18, 
                                                   cex=0.6,...)
                                      
                                      
  })
)
# Close the PDF device opened above
dev.off()
## quartz_off_screen 
##                 2

We can also show the relative enrichment by subtracting, at each relative distance, the actual frequency in the DHS regions from the actual frequency in the GATA3 peaks (GATA3 peaks − DHS regions).

# Distinct rows of the GATA3-peak table, keeping the first five columns
uniq.df.peak.nomotif <- unique(df.peak.nomotif)[, 1:5]

# Distinct rows of the DHS table; its frequency column gets its own name so
# both frequencies can sit side by side after the merge
uniq.df.ctrl <- unique(df.ctrl)[, 1:5]
names(uniq.df.ctrl)[5] <- "actual_freq_DHS"

# Pair each GATA3-peak row with the DHS row that has the same distance and
# anchor/query triplets; peak rows without a DHS match are kept
merge_keys <- c("abs.dis", "dis", "anchor_3mer", "query_3mer")
df.plot2 <- merge(uniq.df.peak.nomotif, uniq.df.ctrl, by = merge_keys, all.x = TRUE)

# Relative frequency = GATA3 peaks - DHS regions (NA where DHS has no value)
freq_difference <- df.plot2$actual_freq - df.plot2$actual_freq_DHS
df.plot2$rel_freq <- ifelse(is.na(df.plot2$actual_freq_DHS), NA, freq_difference)

# Panel factor in the same anchor-query order as the previous figure
df.plot2$pattern <- factor(
  paste0(df.plot2$anchor_3mer, "-", df.plot2$query_3mer),
  levels = c("ATC-ATC", "GAT-ATC", "ATC-GAT", "GAT-GAT")
)
    
library(lattice)
library(latticeExtra)
# Write a 15 x 5 inch PDF: rel_freq against abs.dis, one panel per `pattern`
# level, laid out as 4 columns x 1 row.
pdf(paste0('xy_closest_2nd_3mer_to_closest_1st_3mer_to_GATA3_peaks_without_motifs_compare_to_DHS.pdf'), width=15,height=5)
print(xyplot(rel_freq ~ abs.dis | pattern, 
         data = df.plot2, 
         #groups = pattern,
         #auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
         aspect = 1,
         # only distances 0-50 are shown
         xlim=c(0,50),
         # NOTE(review): rel_freq is a difference (GATA3 - DHS) and can be negative;
         # with ylim starting at 0 those points fall outside the panel - confirm intended.
         ylim=c(0, 0.1),
         layout=c(4,1),
         #type = c('p', 'smooth'),
         xlab = "relative distance (bp) between zinc finger (anchor at G/C)",
         ylab="Relative Frequency (GATA3 peaks - DHS regions)",
         #main=paste0("GATA3 peak with ", motif),
         between=list(y=1.0),
         # x tick marks every 5 bp, labels rotated by 45 degrees
         scales = list(x = list(relation="free", rot = 45, at = seq(from = 0, to = 50, by = 5))),
         #par.settings = list(superpose.line = list(col=c("pink", "skyblue"), lwd=2), strip.background=list(col="grey85")),
         # Custom panel: light-grey vertical guides at x = 1..15, then a density
         # curve of the x values, then the (x, y) points on top.
         # NOTE(review): `data`, `from` and `to` are not formal arguments of
         # panel.densityplot (density() options normally go through `darg`);
         # verify that they have the intended effect.
         panel = function(x,y,...) {panel.abline(v=c(seq(1, 15, 1)), col = "grey90")
                                    panel.densityplot(x, data = df.plot2, 
                                                        from=0, 
                                                        to=50, 
                                                        lty = c(1),
                                                        lwd=2, 
                                                        darg=list(bw = "nrd0", kernel="gaussian"),
                                                        type = "count",
                                                        col="pink", ...)
                                      panel.xyplot(x, y, 
                                                   col="red", 
                                                   pch=18, 
                                                   cex=0.6,...)
                                      
                                      
  })
)
# close the PDF device so the file is written out
dev.off()
## quartz_off_screen 
##                 2

These peaks appear to have fewer enriched spaced 3mers; however, they could still be enriched for single GAT/ATC sites that serve as GATA binding elements.

6.4.2 GAT/ATC single site

ls closest.1st*.bed
head closest.1st.minus.ATC.to.GATA3_without_motifs_123456_78_161bp_mast.bed
wc -l closest.1st.minus.ATC.to.GATA3_without_motifs_123456_78_161bp_mast.bed
wc -l closest.1st.minus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.1st.minus.ATC.to.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.1st.minus.ATC.to.GATA3_without_motifs_quantile1.bed
## closest.1st.minus.ATC.to.indep.DHS.control.consensus.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_1.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_2.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_4.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_5.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_6.bed
## closest.1st.minus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.1st.minus.GAT.to.GATA3_without_motifs_quantile1.bed
## closest.1st.minus.GAT.to.indep.DHS.control.consensus.bed
## closest.1st.plus.ATC.to.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.1st.plus.ATC.to.GATA3_without_motifs_quantile1.bed
## closest.1st.plus.ATC.to.indep.DHS.control.consensus.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_1.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_2.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_4.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_5.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_6.bed
## closest.1st.plus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.1st.plus.GAT.to.GATA3_without_motifs_quantile1.bed
## closest.1st.plus.GAT.to.indep.DHS.control.consensus.bed
## chr1 827380  827381  chr1    827302  827305  36  36  -   ATC 76
## chr1 916769  916770  chr1    916719  916722  36  36  -   ATC 48
## chr1 924853  924854  chr1    924740  924743  36  36  -   ATC 111
## chr1 966653  966654  chr1    966718  966721  36  36  -   ATC 65
## chr1 999508  999509  chr1    999577  999580  36  36  -   ATC 69
## chr1 1000536 1000537 chr1    1000627 1000630 36  36  -   ATC 91
## chr1 1001891 1001892 chr1    1001937 1001940 36  36  -   ATC 46
## chr1 1013265 1013266 chr1    1013247 1013250 36  36  -   ATC 16
## chr1 1013580 1013581 chr1    1013584 1013587 36  36  -   ATC 4
## chr1 1020187 1020188 chr1    1020330 1020333 36  36  -   ATC 143
##    37308 closest.1st.minus.ATC.to.GATA3_without_motifs_123456_78_161bp_mast.bed
##    37308 closest.1st.minus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed

xy plot

GATA3 peaks without motifs:

calculate_actual_frequency <- function(data) {
  # Count every distinct value with table(), then scale the counts by the
  # total number of observations so each value gets its observed frequency.
  # Returns a data frame with the value labels (`value`, taken from the
  # table names) and their frequencies (`actual_freq`).
  counts <- table(data)
  freq <- counts / length(data)
  data.frame(value = names(freq), actual_freq = as.vector(freq))
}


# Build df.peak.nomotif from the closest-3mer files of the GATA3 peaks without
# motifs: one row per line of each file, holding the value of column 11
# (`dis`), its absolute value (`abs.dis`), the strand/3mer label parsed from
# the file name (`closest_3mer`) and the frequency of that absolute distance
# within the file (`actual_freq`).
bed_suffix <- ".to.GATA3_without_motifs_123456_78_161bp_mast.bed"
closest_files <- Sys.glob(file.path("./closest.1st.*.to.GATA3_without_motifs_123456_78_161bp_mast.bed"))

# Typed empty result in the column order merge() produces below, so the frame
# has the same four columns whether or not any file matched. (The previous
# placeholder had five columns but only four names, and rbind() discards a
# zero-row frame anyway.)
df.peak.nomotif <- data.frame(abs.dis = integer(0),
                              dis = integer(0),
                              closest_3mer = character(0),
                              actual_freq = numeric(0),
                              stringsAsFactors = FALSE)

per_file <- vector("list", length(closest_files))
for (k in seq_along(closest_files)) {
    closest_1st_dis <- closest_files[k]
    print(closest_1st_dis)

    # "closest.1st.<strand>.<3mer><bed_suffix>" -> "<strand>.<3mer>", cut by
    # position so the dots in the file name are never treated as regex wildcards
    file_name <- basename(closest_1st_dis)
    closest_3mer <- substring(file_name,
                              nchar("closest.1st.") + 1L,
                              nchar(file_name) - nchar(bed_suffix))
    print(closest_3mer)

    distances <- read.table(closest_1st_dis, header = FALSE, comment.char = "")[, 11]
    temp <- data.frame(dis = as.integer(distances),
                       closest_3mer = closest_3mer,
                       stringsAsFactors = FALSE)
    temp$abs.dis <- abs(temp$dis)
    actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
    per_file[[k]] <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
}

# Bind once at the end instead of growing the data frame inside the loop.
if (length(per_file) > 0) {
    df.peak.nomotif <- do.call(rbind, per_file)
}
## [1] "./closest.1st.minus.ATC.to.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "minus.ATC"
## [1] "./closest.1st.minus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "minus.GAT"
## [1] "./closest.1st.plus.ATC.to.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "plus.ATC"
## [1] "./closest.1st.plus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "plus.GAT"
str(df.peak.nomotif)
## 'data.frame':    149232 obs. of  4 variables:
##  $ abs.dis     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ dis         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ closest_3mer: chr  "minus.ATC" "minus.ATC" "minus.ATC" "minus.ATC" ...
##  $ actual_freq : num  0.0386 0.0386 0.0386 0.0386 0.0386 ...
unique(df.peak.nomotif$closest_3mer)
## [1] "minus.ATC" "minus.GAT" "plus.ATC"  "plus.GAT"
nrow(df.peak.nomotif)
## [1] 149232
head(df.peak.nomotif)
##   abs.dis dis closest_3mer actual_freq
## 1       0   0    minus.ATC  0.03857082
## 2       0   0    minus.ATC  0.03857082
## 3       0   0    minus.ATC  0.03857082
## 4       0   0    minus.ATC  0.03857082
## 5       0   0    minus.ATC  0.03857082
## 6       0   0    minus.ATC  0.03857082
df.peak.nomotif$status="GATA3_peak_without_motifs"

DHS regions (neg ctrl):

# Build df.ctrl from the closest-3mer files of the independent DHS control
# regions: one row per line of each file, holding the value of column 11
# (`dis`), its absolute value (`abs.dis`), the strand/3mer label parsed from
# the file name (`closest_3mer`) and the frequency of that absolute distance
# within the file (`actual_freq`).
ctrl_suffix <- ".to.indep.DHS.control.consensus.bed"
ctrl_files <- Sys.glob(file.path("./closest.1st.*.to.indep.DHS.control.consensus.bed"))

# Typed empty result in the column order merge() produces below, so the frame
# has the same four columns whether or not any file matched. (The previous
# placeholder had five columns but only four names, and rbind() discards a
# zero-row frame anyway.)
df.ctrl <- data.frame(abs.dis = integer(0),
                      dis = integer(0),
                      closest_3mer = character(0),
                      actual_freq = numeric(0),
                      stringsAsFactors = FALSE)

ctrl_per_file <- vector("list", length(ctrl_files))
for (k in seq_along(ctrl_files)) {
    closest_1st_dis <- ctrl_files[k]
    print(closest_1st_dis)

    # "closest.1st.<strand>.<3mer><ctrl_suffix>" -> "<strand>.<3mer>", cut by
    # position so the dots in the file name are never treated as regex wildcards
    file_name <- basename(closest_1st_dis)
    closest_3mer <- substring(file_name,
                              nchar("closest.1st.") + 1L,
                              nchar(file_name) - nchar(ctrl_suffix))
    print(closest_3mer)

    distances <- read.table(closest_1st_dis, header = FALSE, comment.char = "")[, 11]
    temp <- data.frame(dis = as.integer(distances),
                       closest_3mer = closest_3mer,
                       stringsAsFactors = FALSE)
    temp$abs.dis <- abs(temp$dis)
    actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
    ctrl_per_file[[k]] <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
}

# Bind once at the end instead of growing the data frame inside the loop.
if (length(ctrl_per_file) > 0) {
    df.ctrl <- do.call(rbind, ctrl_per_file)
}
## [1] "./closest.1st.minus.ATC.to.indep.DHS.control.consensus.bed"
## [1] "minus.ATC"
## [1] "./closest.1st.minus.GAT.to.indep.DHS.control.consensus.bed"
## [1] "minus.GAT"
## [1] "./closest.1st.plus.ATC.to.indep.DHS.control.consensus.bed"
## [1] "plus.ATC"
## [1] "./closest.1st.plus.GAT.to.indep.DHS.control.consensus.bed"
## [1] "plus.GAT"
str(df.ctrl)
## 'data.frame':    231624 obs. of  4 variables:
##  $ abs.dis     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ dis         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ closest_3mer: chr  "minus.ATC" "minus.ATC" "minus.ATC" "minus.ATC" ...
##  $ actual_freq : num  0.0262 0.0262 0.0262 0.0262 0.0262 ...
unique(df.ctrl$closest_3mer)
## [1] "minus.ATC" "minus.GAT" "plus.ATC"  "plus.GAT"
nrow(df.ctrl)
## [1] 231624
head(df.ctrl)
##   abs.dis dis closest_3mer actual_freq
## 1       0   0    minus.ATC   0.0262149
## 2       0   0    minus.ATC   0.0262149
## 3       0   0    minus.ATC   0.0262149
## 4       0   0    minus.ATC   0.0262149
## 5       0   0    minus.ATC   0.0262149
## 6       0   0    minus.ATC   0.0262149
# Label the control rows, stack them under the GATA3-peak rows, and pin the
# panel order (closest_3mer) and the group order (status) via factor levels.
df.ctrl[["status"]] <- "MCF7_DHS_regions"
df.plot <- rbind(df.peak.nomotif, df.ctrl)

df.plot[["closest_3mer"]] <- factor(
  df.plot[["closest_3mer"]],
  levels = c("plus.GAT", "minus.ATC" , "minus.GAT", "plus.ATC")
)
df.plot[["status"]] <- factor(
  df.plot[["status"]],
  levels = c("GATA3_peak_without_motifs", "MCF7_DHS_regions")
)

nrow(df.plot)
## [1] 380856
nrow(df.plot[!duplicated(df.plot), ])
## [1] 8545
summary(df.plot[!duplicated(df.plot), ])
##     abs.dis            dis            closest_3mer   actual_freq       
##  Min.   :     0   Min.   :     0   plus.GAT :2124   Min.   :1.727e-05  
##  1st Qu.:   267   1st Qu.:   267   minus.ATC:2141   1st Qu.:1.727e-05  
##  Median :  1028   Median :  1028   minus.GAT:2151   Median :1.727e-05  
##  Mean   : 28226   Mean   : 28226   plus.ATC :2129   Mean   :9.362e-04  
##  3rd Qu.: 19605   3rd Qu.: 19605                    3rd Qu.:1.900e-04  
##  Max.   :890240   Max.   :890240                    Max.   :3.860e-02  
##                        status    
##  GATA3_peak_without_motifs:2454  
##  MCF7_DHS_regions         :6091  
##                                  
##                                  
##                                  
## 
library(lattice)
library(latticeExtra)
# Write a 15 x 5 inch PDF: actual_freq against abs.dis, one panel per
# closest_3mer level (4 columns x 1 row), with the two `status` levels
# overlaid in each panel.
pdf('xy_closest_1st_3mer_to_peaks_without_motifs.pdf', width=15,height=5)
print(
  xyplot(actual_freq ~ abs.dis | closest_3mer, 
         #data = df.plot[!duplicated(df.plot), ],
         # all rows are plotted, including the duplicated ones that the
         # commented-out alternative above would drop
         data = df.plot, 
         groups = status,
         # legend on the right, drawn with lines only
         auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
         aspect = 1,
         # only distances 0-50 are shown
         xlim=c(0,50),
         ylim=c(0, 0.15),
         layout=c(4,1),
         #type = c('p', 'smooth'),
         # NOTE(review): same axis label as in the spaced-3mer plots; here abs.dis
         # is read from the closest.1st.* files - confirm the label still applies.
         xlab = "relative distance (bp) between zinc finger (anchor at G/C)",
         ylab="Frequency",
         #main="Independent DHS Regions",
         between=list(y=1.0),
         # x tick marks every 5 bp, labels rotated by 45 degrees
         scales = list(x = list(relation="free", rot = 45, at = seq(from = 0, to = 50, by = 5))),
         par.settings = list(superpose.line = list(col=c("red", "black"), lwd=2), strip.background=list(col="grey85")),
         # Custom panel: light-grey vertical guides at x = 1..10, then a density
         # curve of the x values, then the (x, y) points on top.
         # NOTE(review): `data`, `from` and `to` are not formal arguments of
         # panel.densityplot (density() options normally go through `darg`);
         # verify that they have the intended effect.
         panel = function(x,y,...) {panel.abline(v=c(seq(1, 10, 1)), col = "grey90")
                                    panel.densityplot(x, data = df.plot, 
                                                        from=0, 
                                                        to=50, 
                                                        lty = c(1),
                                                        lwd=2, 
                                                        darg=list(bw = "nrd0", kernel="gaussian"),
                                                        type = "count",
                                                        col=c("pink","grey"), ...)
                                    panel.xyplot(x, y, 
                                                   col=c("red","black"), 
                                                   pch=18, 
                                                   cex=0.6,...)
                                      
                                      
  })
)
# close the PDF device so the file is written out
dev.off()
## quartz_off_screen 
##                 2

Again, we can demonstrate the relative enrichment of the closest 3mer by subtracting the actual frequency in the DHS regions from that in the GATA3 peaks at each relative distance.

# GATA3 peaks: drop fully duplicated rows, keep the first four columns
uniq.df.peak.nomotif <- unique(df.peak.nomotif)[, 1:4]

# DHS regions: same reduction; the fourth column is renamed so it stays
# distinguishable from the GATA3 frequency after the merge
uniq.df.ctrl <- unique(df.ctrl)[, 1:4]
names(uniq.df.ctrl)[4] <- "actual_freq_DHS"

# Relative frequency = GATA3 frequency minus DHS frequency, matched on
# distance and strand/3mer. GATA3 rows with no DHS counterpart are kept
# (all.x = TRUE) and get NA.
merge_keys <- c("abs.dis", "dis", "closest_3mer")
df.plot2 <- merge(uniq.df.peak.nomotif, uniq.df.ctrl, by = merge_keys, all.x = TRUE)
df.plot2$rel_freq <- with(
  df.plot2,
  ifelse(is.na(actual_freq_DHS), NA, actual_freq - actual_freq_DHS)
)

# Fixed panel order
df.plot2$closest_3mer <- factor(
  df.plot2$closest_3mer,
  levels = c("plus.GAT", "minus.ATC" , "minus.GAT", "plus.ATC")
)

    
library(lattice)
library(latticeExtra)
# Write a 15 x 5 inch PDF: rel_freq against abs.dis, one panel per
# closest_3mer level, laid out as 4 columns x 1 row.
pdf(paste0('xy_closest_1st_3mer_GATA3_peaks_without_motifs_compare_to_DHS.pdf'), width=15,height=5)
print(xyplot(rel_freq ~ abs.dis | closest_3mer, 
         data = df.plot2, 
         #groups = pattern,
         #auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
         aspect = 1,
         # only distances 0-50 are shown
         xlim=c(0,50),
         # NOTE(review): rel_freq is a difference (GATA3 - DHS) and can be negative;
         # with ylim starting at 0 those points fall outside the panel - confirm intended.
         ylim=c(0, 0.1),
         layout=c(4,1),
         #type = c('p', 'smooth'),
         # NOTE(review): same axis label as in the spaced-3mer plots; here abs.dis
         # is read from the closest.1st.* files - confirm the label still applies.
         xlab = "relative distance (bp) between zinc finger (anchor at G/C)",
         ylab="Relative Frequency (GATA3 peaks - DHS regions)",
         #main=paste0("GATA3 peak with ", motif),
         between=list(y=1.0),
         # x tick marks every 5 bp, labels rotated by 45 degrees
         scales = list(x = list(relation="free", rot = 45, at = seq(from = 0, to = 50, by = 5))),
         #par.settings = list(superpose.line = list(col=c("pink", "skyblue"), lwd=2), strip.background=list(col="grey85")),
         # Custom panel: light-grey vertical guides at x = 0..15, then a density
         # curve of the x values, then the (x, y) points on top.
         # NOTE(review): `data`, `from` and `to` are not formal arguments of
         # panel.densityplot (density() options normally go through `darg`);
         # verify that they have the intended effect.
         panel = function(x,y,...) {panel.abline(v=c(seq(0, 15, 1)), col = "grey90")
                                    panel.densityplot(x, data = df.plot2, 
                                                        from=0, 
                                                        to=50, 
                                                        lty = c(1),
                                                        lwd=2, 
                                                        darg=list(bw = "nrd0", kernel="gaussian"),
                                                        type = "count",
                                                        col="pink", ...)
                                      panel.xyplot(x, y, 
                                                   col="red", 
                                                   pch=18, 
                                                   cex=0.6,...)
                                      
                                      
  })
)
# close the PDF device so the file is written out
dev.off()
## quartz_off_screen 
##                 2

We can observe the enrichment, but its frequency is low. This could be attributed to low-intensity peaks that are not actually bound by GATA3; they were labeled as GATA3 peaks only because the MACS3 peak-calling software identifies peaks based on the p-value threshold we set. These peaks could dilute the enrichment frequency because they inflate the denominator when the frequency is calculated.

When we divide these GATA3 peaks (without motifs) into five quantiles based on their intensity, it becomes evident that peaks with higher intensity also exhibit stronger enrichment of GAT/ATC close to the peak summit.

6.5 GATA3 peaks without motifs–quantiles

We have divided the GATA3 peaks without motifs into 5 quantiles based on the ranked intensity in January_updates 3.2.2. We have previously made CDF plots to visualize the enrichment.

quantile0.2_summits.bed
quantile0.6_summits.bed
quantile1_summits.bed
quantile0.4_summits.bed
quantile0.8_summits.bed

6.5.0.1 STEP1:

240218_closestBed3.R:

(cd /home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/GAT_ATC_240218/quantiles)

#!/usr/bin/env Rscript

Args=commandArgs(TRUE)
# closestBed function
# Run bedtools closestBed on two BED-like data frames and return its output.
#
# Arguments:
#   functionstring  command used to invoke closestBed; pasted into the shell
#                   command line as given
#   bed1, bed2      data frames written out tab-separated without header and
#                   used as the -a and -b inputs; both are sorted with
#                   `sort -k1,1 -k2,2n` before the call
#   opt.string      extra command-line options, placed before -a / -b
#
# Returns the closestBed output as read by read.table(), with columns named
# c(colnames(bed1), colnames(bed2), "dis").
# Stops with an error when sort or closestBed exits with a non-zero status.
bedTools.closest <- function(functionstring="/home/FCAM/ssun/packages/bedtools2/bin/closestBed",bed1,bed2,opt.string="") {

  # not use scientific notation when writing out
  # NOTE(review): this stays set after the function returns. The calling code
  # in this file passes the result straight to write.table(), which presumably
  # benefits from it - verify before scoping the option to this function.
  options(scipen = 99)

  # Unique temporary files instead of fixed names in the working directory, so
  # that concurrent runs in the same directory cannot overwrite each other's
  # inputs and no unrelated file of the same name is clobbered.
  work_files <- c(
    a = tempfile("a.file.", fileext = ".bed"),
    b = tempfile("b.file.", fileext = ".bed"),
    a_sorted = tempfile("a.file.sorted.", fileext = ".bed"),
    b_sorted = tempfile("b.file.sorted.", fileext = ".bed"),
    out = tempfile("out.file.", fileext = ".bed")
  )
  # Remove the intermediate files even when one of the steps below fails.
  on.exit(unlink(work_files), add = TRUE)

  # write bed formatted data.frames to the temporary files
  write.table(bed1, file = work_files[["a"]], quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  write.table(bed2, file = work_files[["b"]], quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)

  # Echo a shell command, run it, and fail on a non-zero exit status.
  # system() reports a failing command through its return value rather than
  # by raising an R error, so wrapping it in try() does not detect failures.
  run_shell <- function(command) {
    cat(command, "\n")
    status <- system(command)
    if (status != 0) {
      stop("command failed with exit status ", status, ": ", command, call. = FALSE)
    }
    invisible(status)
  }

  # sort the a and b files by coordinates
  run_shell(paste("sort -k1,1 -k2,2n", shQuote(work_files[["a"]]), ">", shQuote(work_files[["a_sorted"]])))
  run_shell(paste("sort -k1,1 -k2,2n", shQuote(work_files[["b"]]), ">", shQuote(work_files[["b_sorted"]])))

  # call closestBed on the sorted inputs
  run_shell(paste(functionstring, opt.string,
                  "-a", shQuote(work_files[["a_sorted"]]),
                  "-b", shQuote(work_files[["b_sorted"]]),
                  ">", shQuote(work_files[["out"]]), sep = " "))

  res <- read.table(work_files[["out"]], header = FALSE, comment.char = "")
  colnames(res) <- c(colnames(bed1), colnames(bed2), "dis")
  res
}

# Input locations: genome-wide 3mer coordinates (dir1) and the per-quantile
# peak summit files (dir2).
dir1 <- "/labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/"
dir2 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/"
#dir3="/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/ENCODE_DHS_GSE29692/"

prioritized_triplets <- c("GAT", "ATC")
library(bigWig)

quantiles <- c("quantile1", "quantile0.8", "quantile0.6", "quantile0.4", "quantile0.2")

# For every triplet and quantile, find the closest plus-strand and the closest
# minus-strand occurrence of the triplet for each peak summit and write each
# result to its own BED file in the working directory.
for (triplet in prioritized_triplets) {
  print(triplet)

  # 3mer genome coordinates, one table per strand
  triplet_sites <- list(
    plus = read.table(
      file = Sys.glob(file.path(paste0(dir1, "hg38.3.3.3plus.*_", triplet, ".bed"))),
      sep = "\t", header = FALSE
    ),
    minus = read.table(
      file = Sys.glob(file.path(paste0(dir1, "hg38.3.3.3minus.*_", triplet, ".bed"))),
      sep = "\t", header = FALSE
    )
  )

  for (quantile in quantiles) {
    print(quantile)

    # peak summits
    summit_table <- read.table(paste0(dir2, quantile, "_summits.bed"), header = FALSE)
    GATA3_peak_summits <- center.bed(summit_table, upstreamWindow = 0, downstreamWindow = 0)

    # closestBed--1st closest, plus strand first, then minus strand
    for (strand in c("plus", "minus")) {
      closest_to_peak <- bedTools.closest(
        bed1 = GATA3_peak_summits[, 1:3],
        bed2 = triplet_sites[[strand]],
        opt.string = "-d -t first"
      )
      write.table(
        closest_to_peak,
        file = paste0("closest.1st.", strand, ".", triplet, ".to.GATA3_without_motifs_", quantile, ".bed"),
        quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE
      )
    }
  }
}

runR.sh

#!/bin/bash
#SBATCH --job-name=runR.sh     # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 8
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o runR.sh_%j.out
#SBATCH -e runR.sh_%j.err

module load R/4.1.2
Rscript 240218_closestBed3.R

6.5.0.2 STEP2: remove the closest 3mer coordinates from the whole genome 3mer coordinates on the same strand with bedtools subtract.

#!/bin/bash
#SBATCH --job-name=remove_1st_3mer.sh     # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 16
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o remove_1st_3mer.sh_%j.out
#SBATCH -e remove_1st_3mer.sh_%j.err


input_dir1=/labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/
input_dir2=/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/GAT_ATC_240218/quantiles/


prioritized_triplets=("GAT" "ATC")
quantiles=("quantile1" "quantile0.8" "quantile0.6" "quantile0.4" "quantile0.2")


module load bedtools

# For every triplet, strand and quantile: take the 3mer coordinates that were
# reported as closest to a peak summit (columns 4-10 of the closest.1st.* file)
# and subtract them (same strand, full overlap) from the genome-wide
# coordinates of that 3mer on that strand.
for triplet in "${prioritized_triplets[@]}"
do
  echo "$triplet"

  # The sorted genome-wide files depend only on triplet and strand, so they are
  # built once here instead of once per quantile.
  for strand in plus minus
  do
    sort -k1,1 -k2,2n "${input_dir1}"hg38.3.3.3${strand}*${triplet}.bed > "hg38.3.3.3${strand}.${triplet}.sorted.bed"
  done

  for quantile in "${quantiles[@]}"
  do
    echo "$quantile"

    # plus, then minus
    for strand in plus minus
    do
      genome_sorted="hg38.3.3.3${strand}.${triplet}.sorted.bed"
      closest_bed="${input_dir2}closest.1st.${strand}.${triplet}.to.GATA3_without_motifs_${quantile}.bed"
      closest_uniq="closest.1st.${strand}.${triplet}.to.GATA3_without_motifs_${quantile}.uniq.sorted.bed"

      # Sort before uniq: uniq only collapses adjacent duplicate lines.
      awk 'BEGIN { OFS = "\t" } { print $4, $5, $6, $7, $8, $9, $10 }' "$closest_bed" \
        | sort -k1,1 -k2,2n | uniq > "$closest_uniq"

      bedtools subtract -a "$genome_sorted" -b "$closest_uniq" -f 1.00 -s \
        > "hg38.3.3.3${strand}.${triplet}_without_1st_${strand}_${triplet}_to_GATA3_without_motifs_${quantile}.bed"

      rm "$closest_uniq"
    done
  done

  # intermediate sorted genome files are no longer needed for this triplet
  rm "hg38.3.3.3plus.${triplet}.sorted.bed" "hg38.3.3.3minus.${triplet}.sorted.bed"
done

6.5.0.3 STEP3

240218_closestBed4.R

#!/usr/bin/env Rscript

Args=commandArgs(TRUE)

# Run bedtools closestBed on two BED-like data frames and return its output.
#
# Arguments:
#   functionstring  command used to invoke closestBed; pasted into the shell
#                   command line as given
#   bed1, bed2      data frames written out tab-separated without header and
#                   used as the -a and -b inputs; both are sorted with
#                   `sort -k1,1 -k2,2n` before the call
#   opt.string      extra command-line options, placed before -a / -b
#
# Returns the closestBed output as read by read.table(), with columns named
# c(colnames(bed1), colnames(bed2), "dis").
# Stops with an error when sort or closestBed exits with a non-zero status.
bedTools.closest <- function(functionstring="/home/FCAM/ssun/packages/bedtools2/bin/closestBed",bed1,bed2,opt.string="") {

  # not use scientific notation when writing out
  # NOTE(review): this stays set after the function returns. The calling code
  # in this file passes the result straight to write.table(), which presumably
  # benefits from it - verify before scoping the option to this function.
  options(scipen = 99)

  # Unique temporary files instead of fixed names in the working directory, so
  # that concurrent runs in the same directory cannot overwrite each other's
  # inputs and no unrelated file of the same name is clobbered.
  work_files <- c(
    a = tempfile("a.file.", fileext = ".bed"),
    b = tempfile("b.file.", fileext = ".bed"),
    a_sorted = tempfile("a.file.sorted.", fileext = ".bed"),
    b_sorted = tempfile("b.file.sorted.", fileext = ".bed"),
    out = tempfile("out.file.", fileext = ".bed")
  )
  # Remove the intermediate files even when one of the steps below fails.
  on.exit(unlink(work_files), add = TRUE)

  # write bed formatted data.frames to the temporary files
  write.table(bed1, file = work_files[["a"]], quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  write.table(bed2, file = work_files[["b"]], quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)

  # Echo a shell command, run it, and fail on a non-zero exit status.
  # system() reports a failing command through its return value rather than
  # by raising an R error, so wrapping it in try() does not detect failures.
  run_shell <- function(command) {
    cat(command, "\n")
    status <- system(command)
    if (status != 0) {
      stop("command failed with exit status ", status, ": ", command, call. = FALSE)
    }
    invisible(status)
  }

  # sort the a and b files by coordinates
  run_shell(paste("sort -k1,1 -k2,2n", shQuote(work_files[["a"]]), ">", shQuote(work_files[["a_sorted"]])))
  run_shell(paste("sort -k1,1 -k2,2n", shQuote(work_files[["b"]]), ">", shQuote(work_files[["b_sorted"]])))

  # call closestBed on the sorted inputs
  run_shell(paste(functionstring, opt.string,
                  "-a", shQuote(work_files[["a_sorted"]]),
                  "-b", shQuote(work_files[["b_sorted"]]),
                  ">", shQuote(work_files[["out"]]), sep = " "))

  res <- read.table(work_files[["out"]], header = FALSE, comment.char = "")
  colnames(res) <- c(colnames(bed1), colnames(bed2), "dis")
  res
}


#library(lattice)
#library(latticeExtra)
#library(Biostrings)
library(bigWig)

# Prioritized triplets to pair up
prioritized_triplets <- c("GAT", "ATC")

# Every ordered pair of triplets (a triplet is also paired with itself); the
# first triplet varies slowest, the second fastest.
n_triplets <- length(prioritized_triplets)
first_3_bases <- rep(prioritized_triplets, each = n_triplets)
last_3_bases <- rep(prioritized_triplets, times = n_triplets)

# The concatenated 6mers, kept as a list with one string per element
all_6mers <- as.list(paste0(first_3_bases, last_3_bases))

# One row per pair: first 3 bases and last 3 bases of the 6mer
df <- data.frame(First_3_bases = first_3_bases, Last_3_bases = last_3_bases)


# nested loop
dir <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/GAT_ATC_240218/quantiles/"
quantiles <- c("quantile1", "quantile0.8", "quantile0.6", "quantile0.4", "quantile0.2")
# Per-pattern window sizes; the columns `pattern`, `plus_upstream` and
# `plus_downstream` are the ones used below.
win <- read.csv(paste0('/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/closest_2nd_other_3mer/pattern_anchor_at_GorC_for_bigWig_pkg.csv'))

# Look up one window column of `win` for a given pattern.
plus_window <- function(pattern, column) {
  win[win$pattern == pattern, column]
}

# GATA3 peaks without motifs:
# for every quantile and every (pattern1, pattern2) row of `df`, find for each
# plus-strand pattern1 site closest to a peak summit the closest plus-strand
# pattern2 site among those left after removing the closest pattern2 sites.
for (quantile in quantiles) {
  print(quantile)
  # seq_len() so that an empty `df` runs zero iterations instead of c(1, 0)
  for (i in seq_len(nrow(df))) {
    pattern1 <- df[i, 1]
    pattern2 <- df[i, 2]

    # anchor position: closest +/- pattern1 relative to G/C
    print(pattern1)
    anchor_table <- read.table(
      paste0(dir, "closest.1st.plus.", pattern1, ".to.GATA3_without_motifs_", quantile, ".bed"),
      header = FALSE
    )[, 4:11]
    closest_plus_3mer_to_GATA3_peak_summits <- fiveprime.bed(
      anchor_table,
      upstreamWindow = plus_window(pattern1, "plus_upstream"),
      downstreamWindow = plus_window(pattern1, "plus_downstream")
    )

    # query 3mer coordinates on genome (without the overlapped closest 3mer coordinates) - relative to G/C
    print(pattern2)
    query_table <- read.table(
      file = paste0(dir, "hg38.3.3.3plus.", pattern2, "_without_1st_plus_", pattern2, "_to_GATA3_without_motifs_", quantile, ".bed"),
      sep = "\t", header = FALSE
    )
    plus.3mer.file <- fiveprime.bed(
      query_table,
      upstreamWindow = plus_window(pattern2, "plus_upstream"),
      downstreamWindow = plus_window(pattern2, "plus_downstream")
    )

    # 2nd closest plus 3mer to closest plus 3mer
    closest.2nd.plus.3mer.to.1st.plus.3mer.GATA3.without.motif <- bedTools.closest(
      bed1 = closest_plus_3mer_to_GATA3_peak_summits[, 1:3],
      bed2 = plus.3mer.file,
      opt.string = "-d -t first"
    )

    write.table(
      closest.2nd.plus.3mer.to.1st.plus.3mer.GATA3.without.motif,
      file = paste0("closest.2nd.plus.", pattern2, ".to.1st.plus.", pattern1, ".GATA3.without.motifs.", quantile, ".bed"),
      quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE
    )
  }
}

runR.sh

#!/bin/bash
#SBATCH --job-name=runR.sh     # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 16
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o runR.sh_%j.out
#SBATCH -e runR.sh_%j.err

module load R/4.1.2
Rscript 240218_closestBed4.R

6.5.0.4 spaced 3mer plots

CDF plots already made.

xy plots
GATA3 peaks without motifs in quantile 1:

calculate_actual_frequency <- function(data) {
  # Count every distinct value with table(), then scale the counts by the
  # total number of observations so each value gets its observed frequency.
  # Returns a data frame with the value labels (`value`, taken from the
  # table names) and their frequencies (`actual_freq`).
  counts <- table(data)
  freq <- counts / length(data)
  data.frame(value = names(freq), actual_freq = as.vector(freq))
}


# Empty placeholder; it is kept as the result only when no file matches.
df.peak.nomotif <- data.frame(matrix(nrow = 0, ncol = 5))
colnames(df.peak.nomotif) <- c("dis","anchor_3mer", "query_3mer","abs.dis", "actual_freq")

# One table per quantile1 file: distance (column 11), anchor and query 3mer
# parsed from the file name, absolute distance and its frequency in that file.
quantile1_files <- Sys.glob(file.path("./closest.2nd.plus*.to.1st.plus.*.GATA3.without.motifs.quantile1.bed"))
quantile1_tables <- vector("list", length(quantile1_files))
for (k in seq_along(quantile1_files)) {
    closest_2nd_dis <- quantile1_files[k]
    print(closest_2nd_dis)

    # last path component of the file
    path_parts <- strsplit(closest_2nd_dis, "/")[[1]]
    file_name <- path_parts[length(path_parts)]

    # anchor 3mer: text between ".to.1st.plus." and the file-name suffix
    after_anchor_prefix <- strsplit(file_name, 'closest.2nd.plus.*.to.1st.plus.')[[1]][2]
    anchor_3mer <- strsplit(after_anchor_prefix, ".GATA3.without.motifs.quantile1.bed")[[1]][1]
    print(anchor_3mer)

    # query 3mer: text between "closest.2nd.plus." and ".to.1st.plus."
    after_query_prefix <- strsplit(file_name, 'closest.2nd.plus.')[[1]][2]
    query_3mer <- strsplit(after_query_prefix, ".to.1st.plus.*.GATA3.without.motifs.quantile1.bed")[[1]][1]
    print(query_3mer)

    distance_column <- read.table(closest_2nd_dis, header=F, comment.char='')[,11]
    temp <- as.data.frame(cbind(distance_column, anchor_3mer, query_3mer))
    colnames(temp) <- c("dis", "anchor_3mer", "query_3mer")
    temp$dis <- as.integer(temp$dis)
    temp$abs.dis <- abs(temp$dis)
    actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
    quantile1_tables[[k]] <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
}
# Stack the per-file tables in file order.
if (length(quantile1_tables) > 0) {
    df.peak.nomotif <- do.call(rbind, quantile1_tables)
}

str(df.peak.nomotif)

unique(df.peak.nomotif$anchor_3mer)
unique(df.peak.nomotif$query_3mer)
nrow(df.peak.nomotif)
head(df.peak.nomotif)

df.peak.nomotif$status="GATA3_peak_without_motifs_quantile1"

DHS regions (neg ctrl):

# Empty placeholder; it is kept as the result only when no file matches.
df.ctrl <- data.frame(matrix(nrow = 0, ncol = 5))
colnames(df.ctrl) <- c("dis","anchor_3mer", "query_3mer","abs.dis", "actual_freq")

# One table per DHS control file: distance (column 11), anchor and query 3mer
# parsed from the file name, absolute distance and its frequency in that file.
dhs_files <- Sys.glob(file.path("./closest.2nd.plus*.to.1st.plus.*.indep.DHS.control.bed"))
dhs_tables <- vector("list", length(dhs_files))
for (k in seq_along(dhs_files)) {
    closest_2nd_dis <- dhs_files[k]
    print(closest_2nd_dis)

    # last path component of the file
    path_parts <- strsplit(closest_2nd_dis, "/")[[1]]
    file_name <- path_parts[length(path_parts)]

    # anchor 3mer: text between ".to.1st.plus." and the file-name suffix
    after_anchor_prefix <- strsplit(file_name, 'closest.2nd.plus.*.to.1st.plus.')[[1]][2]
    anchor_3mer <- strsplit(after_anchor_prefix, ".indep.DHS.control.bed")[[1]][1]
    print(anchor_3mer)

    # query 3mer: text between "closest.2nd.plus." and ".to.1st.plus."
    after_query_prefix <- strsplit(file_name, 'closest.2nd.plus.')[[1]][2]
    query_3mer <- strsplit(after_query_prefix, ".to.1st.plus.*.indep.DHS.control.bed")[[1]][1]
    print(query_3mer)

    distance_column <- read.table(closest_2nd_dis, header=F, comment.char='')[,11]
    temp <- as.data.frame(cbind(distance_column, anchor_3mer, query_3mer))
    colnames(temp) <- c("dis", "anchor_3mer", "query_3mer")
    temp$dis <- as.integer(temp$dis)
    temp$abs.dis <- abs(temp$dis)
    actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
    dhs_tables[[k]] <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
}
# Stack the per-file tables in file order.
if (length(dhs_tables) > 0) {
    df.ctrl <- do.call(rbind, dhs_tables)
}

str(df.ctrl)

unique(df.ctrl$anchor_3mer)
unique(df.ctrl$query_3mer)
nrow(df.ctrl)
head(df.ctrl)
# Label the control rows, stack them under the quantile-1 GATA3-peak rows,
# build the "<anchor>-<query>" panel label, and pin the panel order (pattern)
# and the group order (status) via factor levels.
df.ctrl[["status"]] <- "MCF7_DHS_regions"
df.plot <- rbind(df.peak.nomotif, df.ctrl)

df.plot[["pattern"]] <- factor(
  paste0(df.plot[["anchor_3mer"]], "-", df.plot[["query_3mer"]]),
  levels = c("ATC-ATC", "GAT-ATC", "ATC-GAT", "GAT-GAT")
)
df.plot[["status"]] <- factor(
  df.plot[["status"]],
  levels = c("GATA3_peak_without_motifs_quantile1", "MCF7_DHS_regions")
)

nrow(df.plot)
nrow(df.plot[!duplicated(df.plot), ])
summary(df.plot[!duplicated(df.plot), ])
library(lattice)
library(latticeExtra)
# Write a 15 x 5 inch PDF: actual_freq against abs.dis, one panel per
# `pattern` level (4 columns x 1 row), with the two `status` levels overlaid
# in each panel.
pdf('xy_closest_2nd_3mer_to_closest_1st_3mer_to_GATA3_peaks_without_motifs_quantile1.pdf', width=15,height=5)
print(
  xyplot(actual_freq ~ abs.dis | pattern, 
         #data = df.plot[!duplicated(df.plot), ],
         # all rows are plotted, including the duplicated ones that the
         # commented-out alternative above would drop
         data = df.plot, 
         groups = status,
         # legend on the right, drawn with lines only
         auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
         aspect = 1,
         # only distances 0-50 are shown
         xlim=c(0,50),
         ylim=c(0, 0.15),
         layout=c(4,1),
         #type = c('p', 'smooth'),
         xlab = "relative distance (bp) between zinc finger (anchor at G/C)",
         ylab="Frequency",
         #main="Independent DHS Regions",
         between=list(y=1.0),
         # x tick marks every 5 bp, labels rotated by 45 degrees
         scales = list(x = list(relation="free", rot = 45, at = seq(from = 0, to = 50, by = 5))),
         par.settings = list(superpose.line = list(col=c("red", "black"), lwd=2), strip.background=list(col="grey85")),
         # Custom panel: light-grey vertical guides at x = 1..10, then a density
         # curve of the x values, then the (x, y) points on top.
         # NOTE(review): `data`, `from` and `to` are not formal arguments of
         # panel.densityplot (density() options normally go through `darg`);
         # verify that they have the intended effect.
         panel = function(x,y,...) {panel.abline(v=c(seq(1, 10, 1)), col = "grey90")
                                    panel.densityplot(x, data = df.plot, 
                                                        from=0, 
                                                        to=50, 
                                                        lty = c(1),
                                                        lwd=2, 
                                                        darg=list(bw = "nrd0", kernel="gaussian"),
                                                        type = "count",
                                                        col=c("pink","grey"), ...)
                                    panel.xyplot(x, y, 
                                                   col=c("red","black"), 
                                                   pch=18, 
                                                   cex=0.6,...)
                                      
                                      
  })
)
# close the PDF device so the file is written out
dev.off()

Relative enrichment, obtained by subtracting the actual frequency in the DHS regions from that in the GATA3 peaks at each relative distance.

# GATA3 peaks: drop fully duplicated rows, keep the first five columns
uniq.df.peak.nomotif <- unique(df.peak.nomotif)[, 1:5]

# DHS regions: same reduction; the fifth column is renamed so it stays
# distinguishable from the GATA3 frequency after the merge
uniq.df.ctrl <- unique(df.ctrl)[, 1:5]
names(uniq.df.ctrl)[5] <- "actual_freq_DHS"

# Relative frequency = GATA3 frequency minus DHS frequency, matched on
# distance and on the anchor/query 3mer pair. GATA3 rows with no DHS
# counterpart are kept (all.x = TRUE) and get NA.
merge_keys <- c("abs.dis", "dis", "anchor_3mer", "query_3mer")
df.plot2 <- merge(uniq.df.peak.nomotif, uniq.df.ctrl, by = merge_keys, all.x = TRUE)
df.plot2$rel_freq <- with(
  df.plot2,
  ifelse(is.na(actual_freq_DHS), NA, actual_freq - actual_freq_DHS)
)

# Panel label "<anchor>-<query>" with a fixed panel order
pattern_levels <- c("ATC-ATC", "GAT-ATC", "ATC-GAT", "GAT-GAT")
df.plot2$pattern <- factor(
  paste0(df.plot2$anchor_3mer, "-", df.plot2$query_3mer),
  levels = pattern_levels
)
    
library(lattice)
library(latticeExtra)

# Figure: rel_freq (GATA3 peaks minus DHS regions) against absolute distance,
# one panel per 3-mer pattern. ylim starts at 0, so only positive differences
# fall inside the plotted range.
pdf(
  "xy_closest_2nd_3mer_to_closest_1st_3mer_to_GATA3_peaks_without_motifs_quantile1_compare_to_DHS.pdf",
  width = 15,
  height = 5
)
print(
  xyplot(
    rel_freq ~ abs.dis | pattern,
    data = df.plot2,
    aspect = 1,
    xlim = c(0, 50),
    ylim = c(0, 0.1),
    layout = c(4, 1),
    xlab = "relative distance (bp) between zinc finger (anchor at G/C)",
    ylab = "Relative Frequency (GATA3 peaks - DHS regions)",
    between = list(y = 1.0),
    scales = list(
      x = list(relation = "free", rot = 45, at = seq(from = 0, to = 50, by = 5))
    ),
    panel = function(x, y, ...) {
      # Vertical guides at 1..15 bp, density curve, points, then a horizontal
      # reference line at 0.04.
      panel.abline(v = seq(1, 15, 1), col = "grey90")
      panel.densityplot(
        x,
        data = df.plot2,
        from = 0,
        to = 50,
        lty = 1,
        lwd = 2,
        darg = list(bw = "nrd0", kernel = "gaussian"),
        type = "count",
        col = "pink",
        ...
      )
      panel.xyplot(
        x, y,
        col = "red",
        pch = 18,
        cex = 0.6,
        ...
      )
      panel.abline(h = 0.04, col = "red")
    }
  )
)
dev.off()

GATA3 peaks without motifs in other quantile:
DHS regions (neg ctrl):

#' Relative frequency of each distinct value in a vector
#'
#' @param data A vector of observations (here: absolute distances).
#' @return A data frame with one row per distinct non-NA value and two columns:
#'   `value` (the value as a character string) and `actual_freq` (its count
#'   divided by `length(data)`). For empty input a zero-row data frame with the
#'   same two columns is returned.
calculate_actual_frequency <- function(data) {
  counts <- table(data)
  data.frame(
    # names() of a zero-extent table is NULL; as.character() turns it into
    # character(0) so the `value` column always exists and merge(by.y = "value")
    # keeps working on empty input.
    value = as.character(names(counts)),
    # The denominator is length(data), so NA entries (which table() does not
    # count) still contribute to the total, exactly as before.
    actual_freq = as.vector(counts) / length(data),
    stringsAsFactors = FALSE
  )
}
# DHS control table. For every matching BED file: take column 11 as the signed
# distance `dis`, tag each record with the anchor/query 3-mers parsed from the
# file name, and attach the relative frequency of its absolute distance within
# that file.
df.ctrl <- data.frame(matrix(nrow = 0, ncol = 5))
colnames(df.ctrl) <- c("dis", "anchor_3mer", "query_3mer", "abs.dis", "actual_freq")
for (closest_2nd_dis in Sys.glob(file.path("../closest.2nd.plus*.to.1st.plus.*.indep.DHS.control.bed"))) {
  print(closest_2nd_dis)

  # Anchor 3-mer: the part between "...to.1st.plus." and the file suffix.
  anchor_3mer <- strsplit(
    strsplit(basename(closest_2nd_dis), "closest.2nd.plus.*.to.1st.plus.")[[1]][2],
    ".indep.DHS.control.bed"
  )[[1]][1]
  print(anchor_3mer)

  # Query 3-mer: the part between "closest.2nd.plus." and ".to.1st.plus.".
  query_3mer <- strsplit(
    strsplit(basename(closest_2nd_dis), "closest.2nd.plus.")[[1]][2],
    ".to.1st.plus.*.indep.DHS.control.bed"
  )[[1]][1]
  print(query_3mer)

  temp <- data.frame(
    dis = as.integer(read.table(closest_2nd_dis, header = FALSE, comment.char = "")[, 11]),
    anchor_3mer = anchor_3mer,
    query_3mer = query_3mer
  )
  temp$abs.dis <- abs(temp$dis)

  # Per-file frequency of each absolute distance, joined back onto the records.
  actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
  temp1 <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
  df.ctrl <- rbind(df.ctrl, temp1)
}

# Quick sanity checks on the assembled table.
str(df.ctrl)

unique(df.ctrl$anchor_3mer)
unique(df.ctrl$query_3mer)
nrow(df.ctrl)
head(df.ctrl)

df.ctrl$status <- "MCF7_DHS_regions"

# Quantile labels processed by the loop that follows.
quantiles <- c("quantile0.8", "quantile0.6", "quantile0.4", "quantile0.2")

# For each quantile label: rebuild the peak table from that quantile's BED
# files, stack it on top of the DHS control table, and write one PDF.
for (quantile in quantiles) {
  print(quantile)

  df.peak.nomotif <- data.frame(matrix(nrow = 0, ncol = 5))
  colnames(df.peak.nomotif) <- c("dis", "anchor_3mer", "query_3mer", "abs.dis", "actual_freq")

  for (closest_2nd_dis in Sys.glob(file.path(paste0("./closest.2nd.plus*.to.1st.plus.*.GATA3.without.motifs.", quantile, ".bed")))) {
    print(closest_2nd_dis)

    # Anchor 3-mer: text after "...to.1st.plus." with the quantile suffix removed.
    anchor_3mer <- strsplit(
      strsplit(basename(closest_2nd_dis), "closest.2nd.plus.*.to.1st.plus.")[[1]][2],
      paste0(".GATA3.without.motifs.", quantile, ".bed")
    )[[1]][1]
    print(anchor_3mer)

    # Query 3-mer: text after "closest.2nd.plus." up to the anchor part.
    query_3mer <- strsplit(
      strsplit(basename(closest_2nd_dis), "closest.2nd.plus.")[[1]][2],
      paste0(".to.1st.plus.", anchor_3mer, ".GATA3.without.motifs.", quantile, ".bed")
    )[[1]][1]
    print(query_3mer)

    temp <- data.frame(
      dis = as.integer(read.table(closest_2nd_dis, header = FALSE, comment.char = "")[, 11]),
      anchor_3mer = anchor_3mer,
      query_3mer = query_3mer
    )
    temp$abs.dis <- abs(temp$dis)

    actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
    temp1 <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
    df.peak.nomotif <- rbind(df.peak.nomotif, temp1)
  }

  df.peak.nomotif$status <- paste0("GATA3_peak_without_motifs_", quantile)
  df.plot <- rbind(df.peak.nomotif, df.ctrl)

  # Fixed panel order and group order (peaks first, DHS second).
  df.plot$pattern <- factor(
    paste0(df.plot$anchor_3mer, "-", df.plot$query_3mer),
    levels = c("ATC-ATC", "GAT-ATC", "ATC-GAT", "GAT-GAT")
  )
  df.plot$status <- factor(
    df.plot$status,
    levels = c(paste0("GATA3_peak_without_motifs_", quantile), "MCF7_DHS_regions")
  )

  library(lattice)
  library(latticeExtra)
  pdf(
    paste0("xy_closest_2nd_3mer_to_closest_1st_3mer_to_GATA3_peaks_without_motifs_", quantile, ".pdf"),
    width = 15,
    height = 5
  )
  print(
    xyplot(
      actual_freq ~ abs.dis | pattern,
      data = df.plot,
      groups = status,
      auto.key = list(space = "right", lines = TRUE, points = FALSE, cex = 1),
      aspect = 1,
      xlim = c(0, 50),
      ylim = c(0, 0.15),
      layout = c(4, 1),
      xlab = "relative distance (bp) between zinc finger (anchor at G/C)",
      ylab = "Frequency",
      main = paste0("GATA3 peak without motifs ", quantile),
      between = list(y = 1.0),
      scales = list(
        x = list(relation = "free", rot = 45, at = seq(from = 0, to = 50, by = 5))
      ),
      par.settings = list(
        superpose.line = list(col = c("red", "black"), lwd = 2),
        strip.background = list(col = "grey85")
      ),
      panel = function(x, y, ...) {
        # Guides at 1..10 bp, density curves, then the frequency points.
        panel.abline(v = seq(1, 10, 1), col = "grey90")
        panel.densityplot(
          x,
          data = df.plot,
          from = 0,
          to = 50,
          lty = 1,
          lwd = 2,
          darg = list(bw = "nrd0", kernel = "gaussian"),
          type = "count",
          col = c("pink", "grey"),
          ...
        )
        panel.xyplot(
          x, y,
          col = c("red", "black"),
          pch = 18,
          cex = 0.6,
          ...
        )
      }
    )
  )
  dev.off()
}

6.5.0.5 single 3mer plots

xy plot

GATA3 peaks without motifs:

#' Relative frequency of each distinct value in a vector
#'
#' @param data A vector of observations (here: absolute distances).
#' @return A data frame with columns `value` (distinct non-NA value, as
#'   character) and `actual_freq` (count divided by `length(data)`). Empty
#'   input yields a zero-row data frame that still has both columns.
calculate_actual_frequency <- function(data) {
  freq_table <- table(data)
  data.frame(
    # as.character() guards against names() being NULL for a zero-extent
    # table, which would otherwise drop the `value` column altogether.
    value = as.character(names(freq_table)),
    # NA entries are not tabulated but are still part of length(data).
    actual_freq = as.vector(freq_table) / length(data),
    stringsAsFactors = FALSE
  )
}


# Peak table for the single-3mer plots (GATA3 peaks without motifs, quantile 1).
# For every matching BED file, column 11 is taken as the signed distance `dis`,
# each record is tagged with the 3-mer label parsed from the file name, and the
# per-file relative frequency of its absolute distance is attached.
closest_1st_files <- Sys.glob(file.path("./closest.1st.*.to.GATA3_without_motifs_quantile1.bed"))
if (length(closest_1st_files) == 0L) {
  # Fail here with a clear message; otherwise the `$status` assignment below
  # fails on a zero-row table with an unrelated-looking error.
  stop(
    "no files match ./closest.1st.*.to.GATA3_without_motifs_quantile1.bed",
    call. = FALSE
  )
}

# Zero-row accumulator with the same four columns, in the same order, as the
# merge() result appended in the loop.
df.peak.nomotif <- data.frame(
  abs.dis = integer(0),
  dis = integer(0),
  closest_3mer = character(0),
  actual_freq = numeric(0)
)
for (closest_1st_dis in closest_1st_files) {
  print(closest_1st_dis)

  # 3-mer label (e.g. "plus.GAT"): file name minus fixed prefix and suffix.
  # fixed = TRUE so the dots are matched literally, not as regex wildcards.
  closest_3mer <- strsplit(
    strsplit(basename(closest_1st_dis), "closest.1st.", fixed = TRUE)[[1]][2],
    ".to.GATA3_without_motifs_quantile1.bed",
    fixed = TRUE
  )[[1]][1]
  print(closest_3mer)

  temp <- data.frame(
    dis = as.integer(read.table(closest_1st_dis, header = FALSE, comment.char = "")[, 11]),
    closest_3mer = closest_3mer
  )
  temp$abs.dis <- abs(temp$dis)

  actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
  temp1 <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
  df.peak.nomotif <- rbind(df.peak.nomotif, temp1)
}

# Quick sanity checks on the assembled table.
str(df.peak.nomotif)

unique(df.peak.nomotif$closest_3mer)
nrow(df.peak.nomotif)
head(df.peak.nomotif)

df.peak.nomotif$status <- "GATA3_peak_without_motifs_quantile1"

DHS regions (neg ctrl):

# DHS control table for the single-3mer plots, built the same way as the peak
# table: column 11 is the signed distance, the 3-mer label comes from the file
# name, and the per-file relative frequency of each absolute distance is joined
# onto the records.
closest_1st_files <- Sys.glob(file.path("./closest.1st.*.to.indep.DHS.control.consensus.bed"))
if (length(closest_1st_files) == 0L) {
  # Fail here with a clear message; otherwise the `$status` assignment below
  # fails on a zero-row table with an unrelated-looking error.
  stop(
    "no files match ./closest.1st.*.to.indep.DHS.control.consensus.bed",
    call. = FALSE
  )
}

# Zero-row accumulator with the same four columns, in the same order, as the
# merge() result appended in the loop.
df.ctrl <- data.frame(
  abs.dis = integer(0),
  dis = integer(0),
  closest_3mer = character(0),
  actual_freq = numeric(0)
)
for (closest_1st_dis in closest_1st_files) {
  print(closest_1st_dis)

  # 3-mer label: file name minus fixed prefix and suffix (literal matching).
  closest_3mer <- strsplit(
    strsplit(basename(closest_1st_dis), "closest.1st.", fixed = TRUE)[[1]][2],
    ".to.indep.DHS.control.consensus.bed",
    fixed = TRUE
  )[[1]][1]
  print(closest_3mer)

  temp <- data.frame(
    dis = as.integer(read.table(closest_1st_dis, header = FALSE, comment.char = "")[, 11]),
    closest_3mer = closest_3mer
  )
  temp$abs.dis <- abs(temp$dis)

  actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
  temp1 <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
  df.ctrl <- rbind(df.ctrl, temp1)
}

# Quick sanity checks on the assembled table.
str(df.ctrl)

unique(df.ctrl$closest_3mer)
nrow(df.ctrl)
head(df.ctrl)
df.ctrl$status <- "MCF7_DHS_regions"

# Stack peaks on top of controls and fix the panel / group order for plotting.
df.plot <- rbind(df.peak.nomotif, df.ctrl)

df.plot$closest_3mer <- factor(
  df.plot$closest_3mer,
  levels = c("plus.GAT", "minus.ATC", "minus.GAT", "plus.ATC")
)
df.plot$status <- factor(
  df.plot$status,
  levels = c("GATA3_peak_without_motifs_quantile1", "MCF7_DHS_regions")
)

# Row counts before and after dropping exact duplicate rows.
nrow(df.plot)
nrow(df.plot[!duplicated(df.plot), ])
summary(df.plot[!duplicated(df.plot), ])
library(lattice)
library(latticeExtra)

# Figure: actual_freq against absolute distance, one panel per closest 3-mer,
# with the two `status` groups overlaid.
pdf(
  "xy_closest_1st_3mer_to_peaks_without_motifs_quantile1.pdf",
  width = 15,
  height = 5
)
print(
  xyplot(
    actual_freq ~ abs.dis | closest_3mer,
    data = df.plot,
    groups = status,
    auto.key = list(space = "right", lines = TRUE, points = FALSE, cex = 1),
    aspect = 1,
    xlim = c(0, 50),
    ylim = c(0, 0.15),
    layout = c(4, 1),
    xlab = "relative distance (bp) between zinc finger (anchor at G/C)",
    ylab = "Frequency",
    between = list(y = 1.0),
    scales = list(
      x = list(relation = "free", rot = 45, at = seq(from = 0, to = 50, by = 5))
    ),
    par.settings = list(
      superpose.line = list(col = c("red", "black"), lwd = 2),
      strip.background = list(col = "grey85")
    ),
    panel = function(x, y, ...) {
      # Guides at 1..10 bp, density curves, then the frequency points.
      panel.abline(v = seq(1, 10, 1), col = "grey90")
      panel.densityplot(
        x,
        data = df.plot,
        from = 0,
        to = 50,
        lty = 1,
        lwd = 2,
        darg = list(bw = "nrd0", kernel = "gaussian"),
        type = "count",
        col = c("pink", "grey"),
        ...
      )
      panel.xyplot(
        x, y,
        col = c("red", "black"),
        pch = 18,
        cex = 0.6,
        ...
      )
    }
  )
)
dev.off()

Again, we can demonstrate the relative enrichment of the closest 3mer by subtracting the actual frequency in the DHS regions from that in the GATA3 peaks at each relative distance.

# GATA3 peaks: distinct rows, first four columns (selected by position).
uniq.df.peak.nomotif <- df.peak.nomotif[!duplicated(df.peak.nomotif), 1:4]

# DHS regions: same reduction; column 4 is renamed so the DHS frequency can sit
# next to the peak frequency after the merge.
uniq.df.ctrl <- df.ctrl[!duplicated(df.ctrl), 1:4]
colnames(uniq.df.ctrl)[4] <- "actual_freq_DHS"

# Relative frequency = peak frequency minus DHS frequency.
# all.x = TRUE keeps peak rows that have no DHS counterpart; their rel_freq is NA.
df.plot2 <- merge(
  uniq.df.peak.nomotif,
  uniq.df.ctrl,
  by = c("abs.dis", "dis", "closest_3mer"),
  all.x = TRUE
)
df.plot2$rel_freq <- ifelse(
  is.na(df.plot2$actual_freq_DHS),
  NA,
  df.plot2$actual_freq - df.plot2$actual_freq_DHS
)

# Fixed panel order.
df.plot2$closest_3mer <- factor(
  df.plot2$closest_3mer,
  levels = c("plus.GAT", "minus.ATC", "minus.GAT", "plus.ATC")
)


    
library(lattice)
library(latticeExtra)

# Figure: rel_freq (GATA3 peaks minus DHS regions) against absolute distance,
# one panel per closest 3-mer. ylim starts at 0, so only positive differences
# fall inside the plotted range.
pdf(
  "xy_closest_1st_3mer_GATA3_peaks_without_motifs_quantile1_compare_to_DHS.pdf",
  width = 15,
  height = 5
)
print(
  xyplot(
    rel_freq ~ abs.dis | closest_3mer,
    data = df.plot2,
    aspect = 1,
    xlim = c(0, 50),
    ylim = c(0, 0.1),
    layout = c(4, 1),
    xlab = "relative distance (bp) between zinc finger (anchor at G/C)",
    ylab = "Relative Frequency (GATA3 peaks - DHS regions)",
    between = list(y = 1.0),
    scales = list(
      x = list(relation = "free", rot = 45, at = seq(from = 0, to = 50, by = 5))
    ),
    panel = function(x, y, ...) {
      # Guides at 0..15 bp, density curve, then the difference points.
      panel.abline(v = seq(0, 15, 1), col = "grey90")
      panel.densityplot(
        x,
        data = df.plot2,
        from = 0,
        to = 50,
        lty = 1,
        lwd = 2,
        darg = list(bw = "nrd0", kernel = "gaussian"),
        type = "count",
        col = "pink",
        ...
      )
      panel.xyplot(
        x, y,
        col = "red",
        pch = 18,
        cex = 0.6,
        ...
      )
    }
  )
)
dev.off()

6.6 RSAT for GATA3 peaks without motifs

Input fasta: GATA3 peaks without motifs 12345678:

(cd /home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/RSAT/peak_161_without_motifs_12345678)

module load bedtools

# Genome FASTA and the directory that holds the peak BED file.
genome="/home/FCAM/ssun/Genome/hg38.fa"
dir="/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/"

# Extract the sequence of each BED interval into a FASTA file written to the
# current directory.
fastaFromBed \
    -fi "$genome" \
    -bed "${dir}without_motifs_123456_78_161bp_mast.bed" \
    -fo without_motifs_123456_78_161bp_mast.fasta

Use the previously made background file.

# Activate the conda environment named "rsat" before calling the tool.
source ~/miniconda3/bin/activate
conda activate rsat
# Dyad analysis on the peak FASTA, with the previously made background file
# passed via -expfreq. Output columns are requested with -return
# (exp_occ, occ, ratio) and the result is written to the -o file.
rsat dyad-analysis -o GATA3_peaks_without_motifs_123456_78_161win_mast_RSAT_dyad.txt -i without_motifs_123456_78_161bp_mast.fasta -format FastA -l 3 -sp 0-20 -expfreq ../ENCODE.MCF7.DHS.background4.txt -return exp_occ,occ,ratio -sort -seqtype dna 


# Option notes:
# -1str  single-strand count: only the direct strand is considered for
#        oligonucleotide and dyad occurrence counting.
# -2str  count on both strands: the occurrences of each oligonucleotide are
#        summed on both strands. This allows detection of elements that act in
#        an orientation-insensitive way (as is generally the case for yeast
#        upstream elements).
#
# -type dyad_type (dr|ir|any); "any" is the default.
#        To speed up execution, the program can be asked to restrict its
#        analysis to symmetric dyads. Accepted types:
#          dr   direct repeats: the second element is the same as the first one
#          ir   inverted repeats: the second element is the reverse complement
#               of the first one
#          rep  repeats: direct and inverted repeats are evaluated
#          any  (default) the analysis is performed on all non-symmetric dyads
#               as well
# NOTE(review): "rep" is listed above but is absent from the (dr|ir|any) usage
# string - confirm against the RSAT help text.

View the top patterns ranked by obs/exp ratio:

# Show the first 10 lines after sorting numerically, in descending order, on
# column 8 (the last column of the output shown below).
sort -k8,8nr GATA3_peaks_without_motifs_123456_78_161win_mast_RSAT_dyad.txt | head
## agan{0}taa   agan{0}taa|ttan{0}tct   0.0000096090146    1258   55.24 0   1258      22.77
## agan{0}tta   agan{0}tta|taan{0}tct   0.0001052931460    2453  605.36 22  2475       4.05
## cttn{0}atc   cttn{0}atc|gatn{0}aag   0.0000784961757    1618  451.29 1   1619       3.59
## gatn{0}aac   gatn{0}aac|gttn{0}atc   0.0000457443231     894  263.00 0   894    3.40
## atcn{9}atc   atcn{9}atc|gatn{9}gat   0.0002062495205    3746 1105.62 298 4044       3.39
## ccgn{0}ccg   ccgn{0}ccg|cggn{0}cgg   0.0003669019521    6932 2109.41 1088    8020       3.29
## gatn{10}ata  gatn{10}ata|tatn{10}atc 0.0001643987710    2878  874.66 142 3020       3.29
## gatn{11}ata  gatn{11}ata|tatn{11}atc 0.0001563266321    2714  825.43 78  2792       3.29
## attn{0}atc   attn{0}atc|gatn{0}aat   0.0000619849111    1141  356.37 0   1141       3.20
## atcn{10}atc  atcn{10}atc|gatn{10}gat 0.0002081405868    3470 1107.39 178 3648       3.13

The top significant pattern is a single site AGATAA.

6.7 Is seqoutbias generated 3mer coordinates specific enough for us to detect all 3mer combinations?

In GATA3 peaks exhibiting motif 1 enrichment, we anticipate pinpointing peaks with a GAT sequence proximal to their summit, given their characterization as enriched with a GATA3-like motif 1. However, upon sorting the closest GAT sequences to the summit within the “.1st.plus.GAT.to.GATA3.with.motif_1.bed” file by their distances, we observed significant deviations in certain instances.

For instance, consider GATA_ChIP_peak_24656, which was identified as enriched within a 101bp window containing motif 1. Surprisingly, the closest GAT sequence is positioned 581672bp away. Upon visual inspection using the UCSC genome browser, we observed a GAT sequence located 23bp downstream of the peak summit, followed by an ATC sequence at a spacing of 3bp, consistent with the expected pattern for motif 1-enriched peaks.

The discrepancy in identifying these GAT coordinates by Seqoutbias, which utilizes a read size of 1000, suggests a potential limitation in its approach to searching within the given genome read size. Further investigation is warranted to understand the underlying reasons for this inconsistency.

UCSC genome browser section: https://genome.ucsc.edu/s/ssun/3mer_coordinates_fail

It could be due to the mappability of the sequence itself, in which case seqOutBias does not generate coordinates for that sequence.

Check whether this region falls into highly repetitive elements (RepeatMasker).
A BLAT search of this sequence shows that it maps to two regions of the genome. Since the sequence is not uniquely mappable, no 3mer coordinates are generated for it.

Quantify how many peaks contain their closest GAT at distance >200bp from peak summit.

# For each peak set, count the records whose column 11 (the signed distance
# that the R code in this document reads as `dis`) is more than 200 bp away
# from the summit in either direction. One count is printed per file, in the
# order listed.
# `count + 0` makes awk print 0 rather than an empty line when no record
# matches, so the output always has one number per file.
for bed in \
    closest.1st.plus.GAT.to.GATA3.with.motif_1.bed \
    closest.1st.plus.GAT.to.GATA3.with.motif_2.bed \
    closest.1st.plus.GAT.to.GATA3.with.motif_4.bed \
    closest.1st.plus.GAT.to.GATA3.with.motif_5.bed \
    closest.1st.plus.GAT.to.GATA3.with.motif_6.bed \
    closest.1st.plus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed \
    closest.1st.plus.GAT.to.GATA3_without_motifs_quantile1.bed \
    closest.1st.plus.GAT.to.indep.DHS.control.consensus.bed
do
    awk '$11 > 200 || $11 < -200 {count++} END {print count + 0}' "$bed"
done
## 67
## 53
## 90
## 19
## 50
## 1664
## 120
## 2829
# Line count of each peak-set file, printed as "<count> <file>" per file.
# wc is called once per file so that no "total" line is added.
for bed in \
    closest.1st.plus.GAT.to.GATA3.with.motif_1.bed \
    closest.1st.plus.GAT.to.GATA3.with.motif_2.bed \
    closest.1st.plus.GAT.to.GATA3.with.motif_4.bed \
    closest.1st.plus.GAT.to.GATA3.with.motif_5.bed \
    closest.1st.plus.GAT.to.GATA3.with.motif_6.bed \
    closest.1st.plus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed \
    closest.1st.plus.GAT.to.GATA3_without_motifs_quantile1.bed \
    closest.1st.plus.GAT.to.indep.DHS.control.consensus.bed
do
    wc -l "$bed"
done
##    12470 closest.1st.plus.GAT.to.GATA3.with.motif_1.bed
##    11475 closest.1st.plus.GAT.to.GATA3.with.motif_2.bed
##     6505 closest.1st.plus.GAT.to.GATA3.with.motif_4.bed
##     4167 closest.1st.plus.GAT.to.GATA3.with.motif_5.bed
##     5363 closest.1st.plus.GAT.to.GATA3.with.motif_6.bed
##    37308 closest.1st.plus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed
##     7462 closest.1st.plus.GAT.to.GATA3_without_motifs_quantile1.bed
##    57906 closest.1st.plus.GAT.to.indep.DHS.control.consensus.bed

67/12470=0.005372895
53/11475=0.004618736
90/6505=0.01383551
19/4167=0.004559635
50/5363=0.00932314

For our positive-control peak sets, the fraction of peaks whose closest GAT lies more than 200bp from the peak summit is below 1% for motifs 1, 2, 5 and 6, and 1.38% for motif 4.

For the negative control, the fraction of regions whose closest GAT lies outside the 400bp window around the summit is 4.89% (2829/57906).

For all peaks without motifs, the fraction with a closest GAT outside the 400bp window around the summit is 4.46% (1664/37308).
For the top 20% quantile of peaks without motifs (those with the highest peak intensity), it is 1.6% (120/7462).

6.7.1 stack bar

# Per peak set: number of records with |distance| > 200 bp (out_counts) and the
# total number of records (all_counts), copied from the awk / wc output above.
df <- data.frame(
  peak_set = c(
    "GATA3.with.motif_1",
    "GATA3.with.motif_2",
    "GATA3.with.motif_4",
    "GATA3.with.motif_5",
    "GATA3.with.motif_6",
    "GATA3_without_motifs",
    "GATA3_without_motifs_quantile1",
    "indep.DHS.control.consensus"
  ),
  out_counts = c(67, 53, 90, 19, 50, 1664, 120, 2829),
  all_counts = c(12470, 11475, 6505, 4167, 5363, 37308, 7462, 57906)
)

# Remaining records, i.e. those not counted in out_counts.
df$in_counts <- df$all_counts - df$out_counts

# Keep the rows' own order as the factor level order (names are unique).
df$peak_set <- factor(df$peak_set, levels = df$peak_set)

library(lattice)

# Stacked bar per peak set: in_counts (light blue) with out_counts (pink)
# stacked on top.
# out_counts was counted with a strict "> 200" test, so in_counts covers
# distances up to and including 200 bp; the legend says "<=200bp" accordingly.
barchart(
  in_counts + out_counts ~ peak_set,
  stack = TRUE,
  data = df,
  horizontal = FALSE,
  col = c("lightblue", "pink"),
  key = list(
    space = "right",
    rectangles = list(pch = c(15, 15), col = c("pink", "lightblue")),
    text = list(
      c("closest abs distance >200bp", "closest abs distance <=200bp"),
      cex = 1
    )
  ),
  ylab = "number of peaks",
  scales = list(x = list(rot = 45, cex = 0.6))
)

6.7.2 Violin/bw plot for 1st closest GAT distances

To see how often these deviations happen, we can quantify the distance from each peak summit to its 1st closest GAT for GATA3 peaks with motifs (positive controls), and see how the distances are distributed within each peak set:

plus-GAT

# List the files in the current directory that match closest.1st.plus.GAT.*.bed
ls closest.1st.plus.GAT.*.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_1.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_2.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_4.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_5.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_6.bed
## closest.1st.plus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.1st.plus.GAT.to.GATA3_without_motifs_quantile1.bed
## closest.1st.plus.GAT.to.indep.DHS.control.consensus.bed
# Distance table for the plus.GAT anchor: one row per record of every matching
# BED file, with column 11 as the signed distance and the peak-set name parsed
# from the file name.
df.plot <- data.frame(matrix(nrow = 0, ncol = 4))
colnames(df.plot) <- c("dis", "anchor_3mer", "peak_set", "abs.dis")
for (closest_1st_dis in Sys.glob(file.path("./closest.1st.plus.GAT.to.*.bed"))) {
  print(closest_1st_dis)
  anchor_3mer <- "plus.GAT"

  # Peak-set name: file name minus the fixed prefix and the ".bed" suffix.
  peak_set <- strsplit(
    strsplit(basename(closest_1st_dis), "closest.1st.plus.GAT.to.")[[1]][2],
    ".bed"
  )[[1]][1]
  print(peak_set)

  temp <- data.frame(
    dis = as.integer(read.table(closest_1st_dis, header = FALSE, comment.char = "")[, 11]),
    anchor_3mer = anchor_3mer,
    peak_set = peak_set
  )
  temp$abs.dis <- abs(temp$dis)
  df.plot <- rbind(df.plot, temp)
}
## [1] "./closest.1st.plus.GAT.to.GATA3.with.motif_1.bed"
## [1] "GATA3.with.motif_1"
## [1] "./closest.1st.plus.GAT.to.GATA3.with.motif_2.bed"
## [1] "GATA3.with.motif_2"
## [1] "./closest.1st.plus.GAT.to.GATA3.with.motif_4.bed"
## [1] "GATA3.with.motif_4"
## [1] "./closest.1st.plus.GAT.to.GATA3.with.motif_5.bed"
## [1] "GATA3.with.motif_5"
## [1] "./closest.1st.plus.GAT.to.GATA3.with.motif_6.bed"
## [1] "GATA3.with.motif_6"
## [1] "./closest.1st.plus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "GATA3_without_motifs_123456_78_161bp_mast"
## [1] "./closest.1st.plus.GAT.to.GATA3_without_motifs_quantile1.bed"
## [1] "GATA3_without_motifs_quantile1"
## [1] "./closest.1st.plus.GAT.to.indep.DHS.control.consensus.bed"
## [1] "indep.DHS.control.consensus"
# Fix the display order of the peak sets, then summarise every column.
df.plot$peak_set <- factor(
  df.plot$peak_set,
  levels = c(
    "GATA3.with.motif_1",
    "GATA3.with.motif_2",
    "GATA3.with.motif_4",
    "GATA3.with.motif_5",
    "GATA3.with.motif_6",
    "GATA3_without_motifs_123456_78_161bp_mast",
    "GATA3_without_motifs_quantile1",
    "indep.DHS.control.consensus"
  )
)
summary(df.plot)
##       dis           anchor_3mer       
##  Min.   :     0.0   Length:142656     
##  1st Qu.:     8.0   Class :character  
##  Median :    24.0   Mode  :character  
##  Mean   :   585.7                     
##  3rd Qu.:    59.0                     
##  Max.   :734003.0                     
##                                       
##                                       peak_set        abs.dis        
##  indep.DHS.control.consensus              :57906   Min.   :     0.0  
##  GATA3_without_motifs_123456_78_161bp_mast:37308   1st Qu.:     8.0  
##  GATA3.with.motif_1                       :12470   Median :    24.0  
##  GATA3.with.motif_2                       :11475   Mean   :   585.7  
##  GATA3_without_motifs_quantile1           : 7462   3rd Qu.:    59.0  
##  GATA3.with.motif_4                       : 6505   Max.   :734003.0  
##  (Other)                                  : 9530
# Print the structure (column types and first values) of the combined table.
str(df.plot)
## 'data.frame':    142656 obs. of  4 variables:
##  $ dis        : int  10 16 0 0 3 3 3 0 2 20 ...
##  $ anchor_3mer: chr  "plus.GAT" "plus.GAT" "plus.GAT" "plus.GAT" ...
##  $ peak_set   : Factor w/ 8 levels "GATA3.with.motif_1",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ abs.dis    : int  10 16 0 0 3 3 3 0 2 20 ...
library(lattice)
library(latticeExtra)

# Lattice theme passed to the plot via par.settings.
my.settings <- list(
  superpose.polygon = list(col = c("black", "grey"), border = "transparent"),
  strip.background = list(col = "grey80", cex = 0.6),
  strip.border = list(col = "black")
)

# Box plot of log10(abs.dis) per peak set, drawn over jittered points.
# Records with abs.dis == 0 give log10(0) = -Inf, which is below ylim.
bwplot(
  log10(abs.dis) ~ peak_set,
  data = df.plot,
  do.out = FALSE,
  ylim = c(-1, 7),
  par.settings = my.settings,
  panel = function(...) {
    panel.abline(h = 1.9, lty = 2, col = "red")   # log10(80) ~ 1.9
    panel.abline(h = 1.3, lty = 2, col = "blue")  # log10(20) ~ 1.3
    panel.stripplot(..., jitter.data = TRUE, pch = 4, cex = 0.4, col = "grey")
    panel.bwplot(...)
  },
  scales = list(x = list(rot = 45, cex = 0.6)),
  xlab = "peak set",
  ylab = "closest plus GAT to peak summit log10(abs.dis)"
)

# Lattice theme passed to the plot via par.settings.
my.settings <- list(
  superpose.polygon = list(col = c("black", "grey"), border = "transparent"),
  strip.background = list(col = "grey80", cex = 0.6),
  strip.border = list(col = "black")
)

# Violin outlines of log10(abs.dis) per peak set over jittered points, with
# reference lines at 200, 80 and 20 bp.
bwplot(
  log10(abs.dis) ~ peak_set,
  data = df.plot,
  do.out = FALSE,
  ylim = c(-1, 7),
  par.settings = my.settings,
  scales = list(x = list(rot = 45, cex = 0.6)),
  xlab = "peak set",
  ylab = "closest plus GAT to peak summit log10(abs.dis)",
  panel = function(...) {
    panel.abline(h = 2.3, lty = 2, col = "grey40")  # log10(200) ~ 2.3
    panel.abline(h = 1.9, lty = 2, col = "red")     # log10(80) ~ 1.9
    panel.abline(h = 1.3, lty = 2, col = "blue")    # log10(20) ~ 1.3
    panel.stripplot(..., jitter.data = TRUE, pch = 4, cex = 0.4, col = "grey")
    panel.violin(
      ...,
      col = "transparent",
      border = "black",
      do.out = FALSE,
      varwidth = FALSE
    )
  }
)

# Filled violins (varwidth = TRUE) of log10(abs.dis) per peak set over jittered
# points. box.ratio is captured explicitly so it is handed only to
# panel.violin and not forwarded to panel.stripplot through `...`.
bwplot(
  log10(abs.dis) ~ peak_set,
  data = df.plot,
  ylim = c(-1, 7),
  panel = function(..., box.ratio) {
    panel.abline(h = 1.9, lty = 2, col = "red")   # log10(80) ~ 1.9
    panel.abline(h = 1.3, lty = 2, col = "blue")  # log10(20) ~ 1.3
    panel.stripplot(..., jitter.data = TRUE, pch = 4, cex = 0.4, col = "grey")
    panel.violin(
      ...,
      col = "pink",
      alpha = 0.4,
      box.ratio = box.ratio,
      do.out = FALSE,
      border = "black",
      varwidth = TRUE
    )
  },
  scales = list(x = list(rot = 45, cex = 0.6)),
  xlab = "peak set",
  ylab = "closest plus GAT to peak summit log10(abs.dis)"
)

# Keep only the five motif-containing peak sets and the DHS control.
df.plot1 <- df.plot[
  df.plot$peak_set %in% c(
    "GATA3.with.motif_1",
    "GATA3.with.motif_2",
    "GATA3.with.motif_4",
    "GATA3.with.motif_5",
    "GATA3.with.motif_6",
    "indep.DHS.control.consensus"
  ),
]

# Filled violins of log10(abs.dis) for the retained peak sets, with reference
# lines at 200, 80 and 20 bp.
bwplot(
  log10(abs.dis) ~ peak_set,
  data = df.plot1,
  ylim = c(-1, 7),
  panel = function(..., box.ratio) {
    panel.abline(h = 2.3, lty = 2, col = "grey40")  # log10(200) ~ 2.3
    panel.abline(h = 1.9, lty = 2, col = "red")     # log10(80) ~ 1.9
    panel.abline(h = 1.3, lty = 2, col = "blue")    # log10(20) ~ 1.3
    panel.stripplot(..., jitter.data = TRUE, pch = 4, cex = 0.4, col = "grey")
    panel.violin(
      ...,
      col = "pink",
      alpha = 0.4,
      box.ratio = box.ratio,
      do.out = FALSE,
      border = "black",
      varwidth = TRUE
    )
  },
  scales = list(x = list(rot = 45, cex = 0.6)),
  xlab = "peak set",
  ylab = "closest plus GAT to peak summit log10(abs.dis)"
)

We can observe that for GATA3 peaks with motifs 1, 4, and 5, most peaks have a closest GAT within a 161bp window around the peak summit (log10(80)=1.9, marked by the red dashed line). Additionally, for all positive GATA3 peak sets, the majority of distances fall within a 40bp window around the summit (log10(20)=1.3, marked by the blue dashed line).

In the negative control, the first GAT appears to be more distant from the peak summit.

plus-ATC (same as minus-GAT)

# List the files in the current directory that match closest.1st.minus.GAT.*.bed
ls closest.1st.minus.GAT.*.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_1.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_2.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_4.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_5.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_6.bed
## closest.1st.minus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.1st.minus.GAT.to.GATA3_without_motifs_quantile1.bed
## closest.1st.minus.GAT.to.indep.DHS.control.consensus.bed
# Distance table for the minus.GAT anchor: one row per record of every matching
# BED file, with column 11 as the signed distance and the peak-set name parsed
# from the file name.
df.plot <- data.frame(matrix(nrow = 0, ncol = 4))
colnames(df.plot) <- c("dis", "anchor_3mer", "peak_set", "abs.dis")
for (closest_1st_dis in Sys.glob(file.path("./closest.1st.minus.GAT.to.*.bed"))) {
  print(closest_1st_dis)
  anchor_3mer <- "minus.GAT"

  # Peak-set name: file name minus the fixed prefix and the ".bed" suffix.
  peak_set <- strsplit(
    strsplit(basename(closest_1st_dis), "closest.1st.minus.GAT.to.")[[1]][2],
    ".bed"
  )[[1]][1]
  print(peak_set)

  temp <- data.frame(
    dis = as.integer(read.table(closest_1st_dis, header = FALSE, comment.char = "")[, 11]),
    anchor_3mer = anchor_3mer,
    peak_set = peak_set
  )
  temp$abs.dis <- abs(temp$dis)
  df.plot <- rbind(df.plot, temp)
}
## [1] "./closest.1st.minus.GAT.to.GATA3.with.motif_1.bed"
## [1] "GATA3.with.motif_1"
## [1] "./closest.1st.minus.GAT.to.GATA3.with.motif_2.bed"
## [1] "GATA3.with.motif_2"
## [1] "./closest.1st.minus.GAT.to.GATA3.with.motif_4.bed"
## [1] "GATA3.with.motif_4"
## [1] "./closest.1st.minus.GAT.to.GATA3.with.motif_5.bed"
## [1] "GATA3.with.motif_5"
## [1] "./closest.1st.minus.GAT.to.GATA3.with.motif_6.bed"
## [1] "GATA3.with.motif_6"
## [1] "./closest.1st.minus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "GATA3_without_motifs_123456_78_161bp_mast"
## [1] "./closest.1st.minus.GAT.to.GATA3_without_motifs_quantile1.bed"
## [1] "GATA3_without_motifs_quantile1"
## [1] "./closest.1st.minus.GAT.to.indep.DHS.control.consensus.bed"
## [1] "indep.DHS.control.consensus"
# Fix the plotting order of the eight peak sets, then print a per-column summary.
df.plot[["peak_set"]] <- factor(
  df.plot[["peak_set"]],
  levels = c(
    "GATA3.with.motif_1",
    "GATA3.with.motif_2",
    "GATA3.with.motif_4",
    "GATA3.with.motif_5",
    "GATA3.with.motif_6",
    "GATA3_without_motifs_123456_78_161bp_mast",
    "GATA3_without_motifs_quantile1",
    "indep.DHS.control.consensus"
  )
)
summary(df.plot)
##       dis           anchor_3mer       
##  Min.   :     0.0   Length:142656     
##  1st Qu.:     9.0   Class :character  
##  Median :    24.0   Mode  :character  
##  Mean   :   606.2                     
##  3rd Qu.:    59.0                     
##  Max.   :890194.0                     
##                                       
##                                       peak_set        abs.dis        
##  indep.DHS.control.consensus              :57906   Min.   :     0.0  
##  GATA3_without_motifs_123456_78_161bp_mast:37308   1st Qu.:     9.0  
##  GATA3.with.motif_1                       :12470   Median :    24.0  
##  GATA3.with.motif_2                       :11475   Mean   :   606.2  
##  GATA3_without_motifs_quantile1           : 7462   3rd Qu.:    59.0  
##  GATA3.with.motif_4                       : 6505   Max.   :890194.0  
##  (Other)                                  : 9530
# Check the column types after the factor conversion of peak_set.
str(df.plot)
## 'data.frame':    142656 obs. of  4 variables:
##  $ dis        : int  16 10 18 46 1 8 1 5 57 18 ...
##  $ anchor_3mer: chr  "minus.GAT" "minus.GAT" "minus.GAT" "minus.GAT" ...
##  $ peak_set   : Factor w/ 8 levels "GATA3.with.motif_1",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ abs.dis    : int  16 10 18 46 1 8 1 5 57 18 ...
# Violin + jittered strip plot of log10(|distance|) between each peak summit and
# its closest minus-strand GAT, one violin per peak set.
bwplot(
  log10(abs.dis) ~ peak_set,
  data = df.plot,
  xlab = "peak set",
  ylab = "closest minus GAT to peak summit log10(abs.dis)",
  ylim = c(-1, 7),
  scales = list(x = list(rot = 45, cex = 0.6)),
  # box.ratio is named in the signature so that it is captured here and is not
  # part of the `...` forwarded to the panel functions below.
  panel = function(..., box.ratio) {
    panel.abline(h = 1.9, lty = 2, col = "red")   # log10(80) ~ 1.9
    panel.abline(h = 1.3, lty = 2, col = "blue")  # log10(20) ~ 1.3
    panel.stripplot(..., jitter.data = TRUE, pch = 4, cex = 0.4, col = "grey")
    panel.violin(..., col = "lightblue", alpha = 0.4, border = "black", varwidth = TRUE)
  }
)

# Keep only the motif-containing GATA3 peak sets and the independent DHS control.
df.plot1 <- subset(
  df.plot,
  peak_set %in% c(
    "GATA3.with.motif_1",
    "GATA3.with.motif_2",
    "GATA3.with.motif_4",
    "GATA3.with.motif_5",
    "GATA3.with.motif_6",
    "indep.DHS.control.consensus"
  )
)

# Same violin + strip plot as above, restricted to the selected peak sets and
# drawn with a constant violin width (varwidth = FALSE).
bwplot(
  log10(abs.dis) ~ peak_set,
  data = df.plot1,
  xlab = "peak set",
  ylab = "closest minus GAT to peak summit log10(abs.dis)",
  ylim = c(-1, 7),
  scales = list(x = list(rot = 45, cex = 0.6)),
  panel = function(..., box.ratio) {
    # panel.abline(h = 2.3, lty = 2, col = "grey40")  # log10(200) ~ 2.3
    panel.abline(h = 1.9, lty = 2, col = "red")   # log10(80) ~ 1.9
    panel.abline(h = 1.3, lty = 2, col = "blue")  # log10(20) ~ 1.3
    panel.stripplot(..., jitter.data = TRUE, pch = 4, cex = 0.4, col = "grey")
    panel.violin(
      ...,
      col = "lightblue",
      alpha = 0.4,
      box.ratio = box.ratio,
      do.out = FALSE,
      border = "black",
      varwidth = FALSE
    )
  }
)

7 Loop through other 3mers for GATA3 peaks without motifs

7.0.1 Closest 3mer to peak summits (~2.5h running time)

Goal: given a prioritized 3mer list, generate the coordinates of the closest 3mer to each peak summit in a given set of peaks.

240225_closestBed.R:

(cd /home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/closest_other_3mer)

#!/usr/bin/env Rscript

Args=commandArgs(TRUE)
# closestBed function
#
# Run bedtools closestBed on two BED-like data frames and return its output.
#
# Arguments:
#   functionstring  command used to invoke closestBed (inserted into the shell
#                   command as given).
#   bed1            data frame written as the -a file; it is coordinate-sorted
#                   (sort -k1,1 -k2,2n) before closestBed is called.
#   bed2            data frame written as the -b file. It is NOT sorted here, so
#                   it must already be in coordinate order.
#   opt.string      extra closestBed options, passed through verbatim. The output
#                   must carry exactly one column after the bed1 and bed2 columns
#                   (e.g. the distance added by -d), which is named 'dis'.
#
# Returns: the closestBed output as a data frame with columns
#   colnames(bed1), colnames(bed2), 'dis'.
#
# Stops with an error if sort or closestBed exits with a non-zero status.
# Intermediate files are always removed and options(scipen) is restored on exit.
bedTools.closest.mod <- function(functionstring="/home/FCAM/ssun/packages/bedtools2/bin/closestBed",bed1,bed2,opt.string="") {

  # not use scientific notation when writing out; restore the caller's setting afterwards
  old.options <- options(scipen = 99)
  on.exit(options(old.options), add = TRUE)

  # Intermediate files stay in the working directory (bed2 can be genome-sized),
  # but get unique names so that concurrent runs in one directory cannot collide.
  work.dir <- getwd()
  a.file <- tempfile("a.file.", tmpdir = work.dir, fileext = ".bed")
  a.sorted <- tempfile("a.file.sorted.", tmpdir = work.dir, fileext = ".bed")
  b.sorted <- tempfile("b.file.sorted.", tmpdir = work.dir, fileext = ".bed")
  out.file <- tempfile("out.file.", tmpdir = work.dir, fileext = ".bed")
  on.exit(unlink(c(a.file, a.sorted, b.sorted, out.file)), add = TRUE)

  # Echo a shell command, run it, and fail loudly on a non-zero exit status
  # rather than continuing with a missing or truncated output file.
  run.command <- function(command) {
    cat(command, "\n")
    status <- system(command)
    if (!identical(as.integer(status), 0L)) {
      stop("command failed with exit status ", status, ": ", command, call. = FALSE)
    }
    invisible(status)
  }

  # write bed formatted data.frames to the intermediate files
  write.table(bed1, file = a.file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  write.table(bed2, file = b.sorted, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)

  # sort the -a file by coordinates (the -b file is assumed to be sorted already)
  run.command(paste("sort -k1,1 -k2,2n", shQuote(a.file), ">", shQuote(a.sorted)))

  # call closestBed on bed1 and bed2
  run.command(paste(functionstring, opt.string,
                    "-a", shQuote(a.sorted),
                    "-b", shQuote(b.sorted),
                    ">", shQuote(out.file)))

  res <- read.table(out.file, header = FALSE, comment.char = "")
  colnames(res) <- c(colnames(bed1), colnames(bed2), "dis")
  res
}

# dir1: genome-wide 3mer coordinate files; dir2: peak files.
dir1 <- "/labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/"
dir2 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/"

prioritized_triplets <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")

library(bigWig)

# peak summits: identical for every triplet, so read once before the loop
GATA3_peak_summits <- center.bed(
  read.table(paste0(dir2, "without_motifs_123456_78_161bp_mast.bed"), header = FALSE),
  upstreamWindow = 0, downstreamWindow = 0
)

for (triplet in prioritized_triplets) {
  print(triplet)

  # 3mer genome coordinates; the glob must resolve to exactly one file
  triplet.path <- Sys.glob(file.path(paste0(dir1, "hg38.3.3.3plus.*_", triplet, ".sorted.bed")))
  if (length(triplet.path) != 1) {
    stop("expected exactly one plus-strand coordinate file for ", triplet,
         ", found ", length(triplet.path), call. = FALSE)
  }
  plus.triplet.file <- read.table(file = triplet.path, sep = "\t", header = FALSE)

  # closestBed--1st closest plus
  closest.1st.plus.triplet.to.peak <- bedTools.closest.mod(
    bed1 = GATA3_peak_summits[, 1:3],
    bed2 = plus.triplet.file,
    opt.string = '-d -t first'
  )
  write.table(
    closest.1st.plus.triplet.to.peak,
    file = paste0('closest.1st.plus.', triplet, '.to.GATA3_without_motifs_123456_78_161bp_mast.bed'),
    quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE
  )
}

runR.sh

#!/bin/bash
# Slurm batch script that runs 240225_closestBed.R.
# Resources requested below: 1 node, 1 task, 8 CPUs per task, 128G memory,
# on the "general" partition and QOS.
#SBATCH --job-name=runR.sh     # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 8
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=128G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
# stdout and stderr go to separate files; %j expands to the Slurm job id.
#SBATCH -o runR.sh_%j.out
#SBATCH -e runR.sh_%j.err

module load R/4.1.2
# NOTE: the script path is relative, so submit from the directory that contains
# 240225_closestBed.R.
Rscript 240225_closestBed.R

7.0.1.1 STEP2: remove the closest 3mer coordinates from the whole genome 3mer coordinates on the same strand with bedtools subtract.

#!/bin/bash
#SBATCH --job-name=remove_1st_3mer.sh     # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 16
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o remove_1st_3mer.sh_%j.out
#SBATCH -e remove_1st_3mer.sh_%j.err

# For every prioritized triplet, remove the plus-strand 3mer instances that were
# reported as "closest to a peak summit" from the genome-wide plus-strand 3mer
# coordinates (bedtools subtract, same strand, full overlap required).
# Output files are written to the current working directory.

input_dir1=/labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/
input_dir2=/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/closest_other_3mer/


prioritized_triplets=("AAA" "TAA" "ATA" "TTA" "AAT" "TAT" "GAT" "ATT" "TTT" "ATC" "AGA" "TCT" "TAG" "CTA")

module load bedtools 

status=0

for triplet in "${prioritized_triplets[@]}"
do
  echo "$triplet"

  closest_bed="${input_dir2}closest.1st.plus.${triplet}.to.GATA3_without_motifs_123456_78_161bp_mast.bed"
  closest_3mer_bed="closest.1st.plus.${triplet}.to.GATA3_without_motifs_123456_78_161bp_mast.uniq.sorted.bed"
  out_bed="hg38.3.3.3plus.${triplet}_without_1st_plus_${triplet}_to_GATA3_without_motifs_123456_78_161bp_mast.bed"

  # Genome-wide plus-strand coordinates for this triplet. The "_" before the
  # triplet matches the pattern used by 240225_closestBed.R; the glob must
  # resolve to exactly one existing file.
  genome_beds=( "${input_dir1}"hg38.3.3.3plus.*_"${triplet}".sorted.bed )
  if [ "${#genome_beds[@]}" -ne 1 ] || [ ! -f "${genome_beds[0]}" ]; then
    echo "ERROR: expected exactly one genome 3mer file for ${triplet}, got: ${genome_beds[*]}" >&2
    status=1
    continue
  fi

  # plus
  # Columns 4-10 of the closestBed output are the 3mer record itself. Sort
  # before uniq so that non-adjacent duplicates are removed as well.
  awk 'BEGIN{OFS="\t"} {print $4, $5, $6, $7, $8, $9, $10}' "$closest_bed" \
    | sort -k1,1 -k2,2n \
    | uniq > "$closest_3mer_bed"

  bedtools subtract -a "${genome_beds[0]}" -b "$closest_3mer_bed" -f 1.00 -s > "$out_bed"

  rm "$closest_3mer_bed"

done

exit "$status"

7.0.1.2 STEP3

240226_closestBed_peak_without_mot.R

#!/usr/bin/env Rscript

Args=commandArgs(TRUE)

# Run bedtools closestBed on two BED-like data frames and return its output.
#
# Arguments:
#   functionstring  command used to invoke closestBed (inserted into the shell
#                   command as given).
#   bed1            data frame written as the -a file; it is coordinate-sorted
#                   (sort -k1,1 -k2,2n) before closestBed is called.
#   bed2            data frame written as the -b file. It is NOT sorted here, so
#                   it must already be in coordinate order.
#   opt.string      extra closestBed options, passed through verbatim. The output
#                   must carry exactly one column after the bed1 and bed2 columns
#                   (e.g. the distance added by -d), which is named 'dis'.
#
# Returns: the closestBed output as a data frame with columns
#   colnames(bed1), colnames(bed2), 'dis'.
#
# Stops with an error if sort or closestBed exits with a non-zero status.
# Intermediate files are always removed and options(scipen) is restored on exit.
bedTools.closest.mod <- function(functionstring="/home/FCAM/ssun/packages/bedtools2/bin/closestBed",bed1,bed2,opt.string="") {

  # not use scientific notation when writing out; restore the caller's setting afterwards
  old.options <- options(scipen = 99)
  on.exit(options(old.options), add = TRUE)

  # Intermediate files stay in the working directory (bed2 can be genome-sized),
  # but get unique names so that concurrent runs in one directory cannot collide.
  work.dir <- getwd()
  a.file <- tempfile("a.file.", tmpdir = work.dir, fileext = ".bed")
  a.sorted <- tempfile("a.file.sorted.", tmpdir = work.dir, fileext = ".bed")
  b.sorted <- tempfile("b.file.sorted.", tmpdir = work.dir, fileext = ".bed")
  out.file <- tempfile("out.file.", tmpdir = work.dir, fileext = ".bed")
  on.exit(unlink(c(a.file, a.sorted, b.sorted, out.file)), add = TRUE)

  # Echo a shell command, run it, and fail loudly on a non-zero exit status
  # rather than continuing with a missing or truncated output file.
  run.command <- function(command) {
    cat(command, "\n")
    status <- system(command)
    if (!identical(as.integer(status), 0L)) {
      stop("command failed with exit status ", status, ": ", command, call. = FALSE)
    }
    invisible(status)
  }

  # write bed formatted data.frames to the intermediate files
  write.table(bed1, file = a.file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  write.table(bed2, file = b.sorted, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)

  # sort the -a file by coordinates (the -b file is assumed to be sorted already)
  run.command(paste("sort -k1,1 -k2,2n", shQuote(a.file), ">", shQuote(a.sorted)))

  # call closestBed on bed1 and bed2
  run.command(paste(functionstring, opt.string,
                    "-a", shQuote(a.sorted),
                    "-b", shQuote(b.sorted),
                    ">", shQuote(out.file)))

  res <- read.table(out.file, header = FALSE, comment.char = "")
  colnames(res) <- c(colnames(bed1), colnames(bed2), "dis")
  res
}

library(bigWig)

# List of prioritized triplets
prioritized_triplets <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")

# Every ordered pair of prioritized triplets (including a triplet paired with
# itself): First_3_bases is the anchor 3mer, Last_3_bases is the query 3mer.
n_triplets <- length(prioritized_triplets)
df <- data.frame(First_3_bases = rep(prioritized_triplets, each = n_triplets),
                 Last_3_bases = rep(prioritized_triplets, times = n_triplets),
                 stringsAsFactors = FALSE)


dir1 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/closest_other_3mer/"
dir2 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/closest_other_3mer/closest_2nd_other_3mer/"


# Anchor positions: the closest plus-strand pattern1 to each GATA3 peak without
# motifs (columns 4-11 of the first closestBed output). One small file per
# triplet, so each is read once and reused for every query 3mer.
read_anchor <- function(pattern1) {
  closest_plus_3mer_to_peak <- fiveprime.bed(
    read.table(paste0(dir1, "closest.1st.plus.", pattern1, ".to.GATA3_without_motifs_123456_78_161bp_mast.bed"), header = FALSE)[, 4:11],
    upstreamWindow = 0, downstreamWindow = 0
  )
  closest_plus_3mer_to_peak[, 1:3]
}
anchors <- lapply(setNames(nm = unique(df$First_3_bases)), read_anchor)

# The query file is genome-sized, so the query 3mer drives the outer loop: each
# query file is read once rather than once per (anchor, query) pair. Every pair
# writes its own output file, so the iteration order does not affect the results.
for (pattern2 in unique(df$Last_3_bases)) {
  # query 3mer coordinates on genome (without the overlapped closest 3mer coordinates)
  print(pattern2)
  plus.3mer.file <- fiveprime.bed(
    read.table(file = paste0(dir2, "hg38.3.3.3plus.", pattern2, "_without_1st_plus_", pattern2, "_to_GATA3_without_motifs_123456_78_161bp_mast.bed"), sep = "\t", header = FALSE),
    upstreamWindow = 0, downstreamWindow = 0
  )

  for (pattern1 in df$First_3_bases[df$Last_3_bases == pattern2]) {
    print(pattern1)

    # 2nd closest plus 3mer to closest plus 3mer
    closest.2nd.plus.3mer.to.1st.plus.3mer <- bedTools.closest.mod(
      bed1 = anchors[[pattern1]],
      bed2 = plus.3mer.file,
      opt.string = '-d -t first'
    )

    write.table(
      closest.2nd.plus.3mer.to.1st.plus.3mer,
      file = paste0("closest.2nd.plus.", pattern2, ".to.1st.plus.", pattern1, ".to.GATA3_without_motifs_123456_78.bed"),
      quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE
    )
  }
}

runR_peak_without_mot.sh

#!/bin/bash
# Slurm batch script that runs 240226_closestBed_peak_without_mot.R from inside
# the peak_without_mot/ subdirectory, so the R script's output files land there.
#SBATCH --job-name=runR_peak_without_mot.sh    # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 8
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=128G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o runR_peak_without_mot.sh_%j.out
#SBATCH -e runR_peak_without_mot.sh_%j.err

hostname
# -p: do not fail when the directory already exists (e.g. on a rerun).
mkdir -p peak_without_mot
# Abort if the directory cannot be entered; otherwise the relative script path
# below would be resolved from the wrong directory.
cd peak_without_mot || exit 1

module load R/4.1.2
Rscript ../240226_closestBed_peak_without_mot.R